summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/ndisc.c2
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c11
-rw-r--r--net/802/hippi.c16
-rw-r--r--net/8021q/vlan.c13
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c3
-rw-r--r--net/9p/trans_rdma.c2
-rw-r--r--net/Kconfig13
-rw-r--r--net/Makefile1
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/br2684.c6
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/lec.c31
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/mpoa_caches.c43
-rw-r--r--net/ax25/af_ax25.c2
-rw-r--r--net/ax25/ax25_addr.c2
-rw-r--r--net/ax25/ax25_dev.c2
-rw-r--r--net/ax25/ax25_ds_in.c2
-rw-r--r--net/ax25/ax25_ds_subr.c2
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_iface.c2
-rw-r--r--net/ax25/ax25_in.c2
-rw-r--r--net/ax25/ax25_ip.c2
-rw-r--r--net/ax25/ax25_out.c2
-rw-r--r--net/ax25/ax25_route.c2
-rw-r--r--net/ax25/ax25_std_in.c2
-rw-r--r--net/ax25/ax25_std_subr.c2
-rw-r--r--net/ax25/ax25_std_timer.c2
-rw-r--r--net/ax25/ax25_subr.c4
-rw-r--r--net/ax25/ax25_timer.c2
-rw-r--r--net/ax25/ax25_uid.c2
-rw-r--r--net/batman-adv/Kconfig17
-rw-r--r--net/batman-adv/Makefile4
-rw-r--r--net/batman-adv/bat_algo.c70
-rw-r--r--net/batman-adv/bat_algo.h3
-rw-r--r--net/batman-adv/bat_iv_ogm.c887
-rw-r--r--net/batman-adv/bat_v.c734
-rw-r--r--net/batman-adv/bat_v_elp.c71
-rw-r--r--net/batman-adv/bat_v_ogm.c80
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c348
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h17
-rw-r--r--net/batman-adv/debugfs.c44
-rw-r--r--net/batman-adv/debugfs.h4
-rw-r--r--net/batman-adv/distributed-arp-table.c88
-rw-r--r--net/batman-adv/fragmentation.c82
-rw-r--r--net/batman-adv/fragmentation.h2
-rw-r--r--net/batman-adv/gateway_client.c294
-rw-r--r--net/batman-adv/gateway_client.h7
-rw-r--r--net/batman-adv/gateway_common.c5
-rw-r--r--net/batman-adv/hard-interface.c299
-rw-r--r--net/batman-adv/hard-interface.h21
-rw-r--r--net/batman-adv/hash.h30
-rw-r--r--net/batman-adv/icmp_socket.c5
-rw-r--r--net/batman-adv/icmp_socket.h18
-rw-r--r--net/batman-adv/log.c4
-rw-r--r--net/batman-adv/log.h14
-rw-r--r--net/batman-adv/main.c35
-rw-r--r--net/batman-adv/main.h28
-rw-r--r--net/batman-adv/multicast.c74
-rw-r--r--net/batman-adv/multicast.h6
-rw-r--r--net/batman-adv/netlink.c248
-rw-r--r--net/batman-adv/netlink.h6
-rw-r--r--net/batman-adv/network-coding.c54
-rw-r--r--net/batman-adv/originator.c197
-rw-r--r--net/batman-adv/originator.h4
-rw-r--r--net/batman-adv/packet.h48
-rw-r--r--net/batman-adv/routing.c223
-rw-r--r--net/batman-adv/send.c553
-rw-r--r--net/batman-adv/send.h17
-rw-r--r--net/batman-adv/soft-interface.c78
-rw-r--r--net/batman-adv/sysfs.c234
-rw-r--r--net/batman-adv/tp_meter.c7
-rw-r--r--net/batman-adv/translation-table.c599
-rw-r--r--net/batman-adv/translation-table.h7
-rw-r--r--net/batman-adv/tvlv.c14
-rw-r--r--net/batman-adv/types.h114
-rw-r--r--net/bluetooth/6lowpan.c6
-rw-r--r--net/bluetooth/Makefile2
-rw-r--r--net/bluetooth/a2mp.c4
-rw-r--r--net/bluetooth/af_bluetooth.c15
-rw-r--r--net/bluetooth/amp.c4
-rw-r--r--net/bluetooth/bnep/netdev.c3
-rw-r--r--net/bluetooth/hci_conn.c26
-rw-r--r--net/bluetooth/hci_core.c1
-rw-r--r--net/bluetooth/hci_request.c92
-rw-r--r--net/bluetooth/hci_request.h30
-rw-r--r--net/bluetooth/hci_sock.c396
-rw-r--r--net/bluetooth/l2cap_core.c12
-rw-r--r--net/bluetooth/leds.c27
-rw-r--r--net/bluetooth/leds.h10
-rw-r--r--net/bluetooth/mgmt.c340
-rw-r--r--net/bluetooth/mgmt_util.c66
-rw-r--r--net/bluetooth/rfcomm/tty.c2
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/bluetooth/smp.c90
-rw-r--r--net/bluetooth/smp.h1
-rw-r--r--net/bridge/Makefile2
-rw-r--r--net/bridge/br.c6
-rw-r--r--net/bridge/br_device.c15
-rw-r--r--net/bridge/br_fdb.c33
-rw-r--r--net/bridge/br_forward.c10
-rw-r--r--net/bridge/br_if.c12
-rw-r--r--net/bridge/br_input.c42
-rw-r--r--net/bridge/br_ioctl.c2
-rw-r--r--net/bridge/br_multicast.c211
-rw-r--r--net/bridge/br_netfilter_hooks.c67
-rw-r--r--net/bridge/br_netfilter_ipv6.c14
-rw-r--r--net/bridge/br_netlink.c199
-rw-r--r--net/bridge/br_private.h57
-rw-r--r--net/bridge/br_private_stp.h1
-rw-r--r--net/bridge/br_stp.c65
-rw-r--r--net/bridge/br_stp_if.c57
-rw-r--r--net/bridge/br_stp_timer.c2
-rw-r--r--net/bridge/br_switchdev.c57
-rw-r--r--net/bridge/br_sysfs_br.c41
-rw-r--r--net/bridge/br_sysfs_if.c2
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c3
-rw-r--r--net/bridge/netfilter/ebt_log.c13
-rw-r--r--net/bridge/netfilter/ebt_nflog.c6
-rw-r--r--net/bridge/netfilter/ebt_redirect.c8
-rw-r--r--net/bridge/netfilter/ebtable_broute.c2
-rw-r--r--net/bridge/netfilter/ebtables.c10
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c20
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c92
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c2
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c74
-rw-r--r--net/caif/caif_dev.c2
-rw-r--r--net/caif/caif_socket.c5
-rw-r--r--net/caif/cfcnfg.c9
-rw-r--r--net/can/af_can.c12
-rw-r--r--net/can/af_can.h3
-rw-r--r--net/can/bcm.c109
-rw-r--r--net/can/gw.c4
-rw-r--r--net/can/raw.c7
-rw-r--r--net/ceph/Makefile1
-rw-r--r--net/ceph/auth.c17
-rw-r--r--net/ceph/auth_none.c2
-rw-r--r--net/ceph/auth_x.c197
-rw-r--r--net/ceph/auth_x.h3
-rw-r--r--net/ceph/ceph_common.c13
-rw-r--r--net/ceph/ceph_fs.c3
-rw-r--r--net/ceph/ceph_strings.c1
-rw-r--r--net/ceph/cls_lock_client.c325
-rw-r--r--net/ceph/crush/mapper.c19
-rw-r--r--net/ceph/crypto.c463
-rw-r--r--net/ceph/crypto.h26
-rw-r--r--net/ceph/messenger.c23
-rw-r--r--net/ceph/mon_client.c94
-rw-r--r--net/ceph/osd_client.c219
-rw-r--r--net/ceph/pagevec.c2
-rw-r--r--net/compat.c2
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c78
-rw-r--r--net/core/dev.c1063
-rw-r--r--net/core/devlink.c100
-rw-r--r--net/core/drop_monitor.c62
-rw-r--r--net/core/ethtool.c109
-rw-r--r--net/core/fib_rules.c78
-rw-r--r--net/core/filter.c850
-rw-r--r--net/core/flow.c66
-rw-r--r--net/core/flow_dissector.c207
-rw-r--r--net/core/gen_estimator.c296
-rw-r--r--net/core/gen_stats.c20
-rw-r--r--net/core/lwt_bpf.c397
-rw-r--r--net/core/lwtunnel.c118
-rw-r--r--net/core/neighbour.c22
-rw-r--r--net/core/net-sysfs.c65
-rw-r--r--net/core/net_namespace.c97
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/pktgen.c40
-rw-r--r--net/core/rtnetlink.c352
-rw-r--r--net/core/scm.c2
-rw-r--r--net/core/secure_seq.c11
-rw-r--r--net/core/skbuff.c232
-rw-r--r--net/core/sock.c134
-rw-r--r--net/core/sock_reuseport.c1
-rw-r--r--net/core/stream.c29
-rw-r--r--net/core/sysctl_net_core.c5
-rw-r--r--net/core/utils.c2
-rw-r--r--net/dcb/dcbnl.c1
-rw-r--r--net/dccp/input.c3
-rw-r--r--net/dccp/ipv4.c36
-rw-r--r--net/dccp/ipv6.c25
-rw-r--r--net/dccp/proto.c4
-rw-r--r--net/decnet/af_decnet.c16
-rw-r--r--net/decnet/dn_dev.c4
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_table.c2
-rw-r--r--net/decnet/sysctl_net_decnet.c2
-rw-r--r--net/dsa/Kconfig3
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa.c102
-rw-r--r--net/dsa/dsa2.c42
-rw-r--r--net/dsa/dsa_priv.h2
-rw-r--r--net/dsa/slave.c265
-rw-r--r--net/dsa/tag_qca.c138
-rw-r--r--net/ethernet/eth.c11
-rw-r--r--net/hsr/hsr_device.c1
-rw-r--r--net/hsr/hsr_forward.c4
-rw-r--r--net/hsr/hsr_netlink.c23
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h2
-rw-r--r--net/ieee802154/Makefile2
-rw-r--r--net/ieee802154/netlink.c24
-rw-r--r--net/ieee802154/nl-phy.c6
-rw-r--r--net/ieee802154/nl802154.c44
-rw-r--r--net/ipv4/Kconfig27
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c64
-rw-r--r--net/ipv4/arp.c12
-rw-r--r--net/ipv4/cipso_ipv4.c4
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/fib_frontend.c71
-rw-r--r--net/ipv4/fib_rules.c15
-rw-r--r--net/ipv4/fib_semantics.c26
-rw-r--r--net/ipv4/fib_trie.c351
-rw-r--r--net/ipv4/fou.c29
-rw-r--r--net/ipv4/gre_offload.c8
-rw-r--r--net/ipv4/icmp.c12
-rw-r--r--net/ipv4/igmp.c68
-rw-r--r--net/ipv4/inet_connection_sock.c20
-rw-r--r--net/ipv4/inet_diag.c178
-rw-r--r--net/ipv4/inet_hashtables.c8
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_gre.c29
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_output.c85
-rw-r--r--net/ipv4/ip_sockglue.c69
-rw-r--r--net/ipv4/ip_tunnel.c86
-rw-r--r--net/ipv4/ip_tunnel_core.c15
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipconfig.c73
-rw-r--r--net/ipv4/ipip.c39
-rw-r--r--net/ipv4/ipmr.c19
-rw-r--r--net/ipv4/netfilter.c5
-rw-r--r--net/ipv4/netfilter/Kconfig21
-rw-r--r--net/ipv4/netfilter/Makefile8
-rw-r--r--net/ipv4/netfilter/arp_tables.c48
-rw-r--r--net/ipv4/netfilter/ip_tables.c46
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c43
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c11
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c4
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c8
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c18
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c217
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c492
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c41
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c41
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c7
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c13
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c13
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c163
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c7
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c5
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c236
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c15
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c14
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c4
-rw-r--r--net/ipv4/ping.c34
-rw-r--r--net/ipv4/proc.c103
-rw-r--r--net/ipv4/raw.c43
-rw-r--r--net/ipv4/raw_diag.c266
-rw-r--r--net/ipv4/route.c153
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c24
-rw-r--r--net/ipv4/tcp.c215
-rw-r--r--net/ipv4/tcp_bbr.c926
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c18
-rw-r--r--net/ipv4/tcp_dctcp.c14
-rw-r--r--net/ipv4/tcp_fastopen.c3
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_hybla.c1
-rw-r--r--net/ipv4/tcp_illinois.c10
-rw-r--r--net/ipv4/tcp_input.c588
-rw-r--r--net/ipv4/tcp_ipv4.c93
-rw-r--r--net/ipv4/tcp_lp.c1
-rw-r--r--net/ipv4/tcp_metrics.c26
-rw-r--r--net/ipv4/tcp_minisocks.c10
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_output.c296
-rw-r--r--net/ipv4/tcp_probe.c4
-rw-r--r--net/ipv4/tcp_rate.c186
-rw-r--r--net/ipv4/tcp_scalable.c15
-rw-r--r--net/ipv4/tcp_timer.c8
-rw-r--r--net/ipv4/tcp_vegas.c1
-rw-r--r--net/ipv4/tcp_veno.c10
-rw-r--r--net/ipv4/tcp_westwood.c1
-rw-r--r--net/ipv4/tcp_yeah.c10
-rw-r--r--net/ipv4/udp.c298
-rw-r--r--net/ipv4/udp_diag.c89
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udp_offload.c8
-rw-r--r--net/ipv4/udplite.c4
-rw-r--r--net/ipv4/xfrm4_policy.c2
-rw-r--r--net/ipv6/Kconfig35
-rw-r--r--net/ipv6/Makefile4
-rw-r--r--net/ipv6/addrconf.c287
-rw-r--r--net/ipv6/af_inet6.c22
-rw-r--r--net/ipv6/ah6.c5
-rw-r--r--net/ipv6/datagram.c28
-rw-r--r--net/ipv6/esp6.c7
-rw-r--r--net/ipv6/exthdrs.c245
-rw-r--r--net/ipv6/fib6_rules.c3
-rw-r--r--net/ipv6/icmp.c15
-rw-r--r--net/ipv6/ila/ila_common.c1
-rw-r--r--net/ipv6/ila/ila_lwt.c95
-rw-r--r--net/ipv6/ila/ila_xlat.c45
-rw-r--r--net/ipv6/inet6_connection_sock.c11
-rw-r--r--net/ipv6/inet6_hashtables.c13
-rw-r--r--net/ipv6/ip6_fib.c6
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_gre.c63
-rw-r--r--net/ipv6/ip6_offload.c10
-rw-r--r--net/ipv6/ip6_output.c58
-rw-r--r--net/ipv6/ip6_tunnel.c255
-rw-r--r--net/ipv6/ip6_udp_tunnel.c3
-rw-r--r--net/ipv6/ip6_vti.c71
-rw-r--r--net/ipv6/ip6mr.c4
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ipv6_sockglue.c23
-rw-r--r--net/ipv6/mcast.c77
-rw-r--r--net/ipv6/mip6.c2
-rw-r--r--net/ipv6/ndisc.c40
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/netfilter/Kconfig14
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c47
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c23
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c8
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c11
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c148
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c4
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c44
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c21
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c151
-rw-r--r--net/ipv6/netfilter/nf_tables_ipv6.c9
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c270
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c14
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c14
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c6
-rw-r--r--net/ipv6/output_core.c9
-rw-r--r--net/ipv6/ping.c3
-rw-r--r--net/ipv6/proc.c30
-rw-r--r--net/ipv6/raw.c23
-rw-r--r--net/ipv6/reassembly.c10
-rw-r--r--net/ipv6/route.c169
-rw-r--r--net/ipv6/seg6.c497
-rw-r--r--net/ipv6/seg6_hmac.c484
-rw-r--r--net/ipv6/seg6_iptunnel.c436
-rw-r--r--net/ipv6/sit.c31
-rw-r--r--net/ipv6/syncookies.c2
-rw-r--r--net/ipv6/tcp_ipv6.c78
-rw-r--r--net/ipv6/udp.c76
-rw-r--r--net/ipv6/udp_impl.h4
-rw-r--r--net/ipv6/udplite.c4
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/ipx/af_ipx.c4
-rw-r--r--net/irda/af_irda.c5
-rw-r--r--net/irda/ircomm/ircomm_tty.c2
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c2
-rw-r--r--net/irda/irda_device.c2
-rw-r--r--net/irda/irlan/irlan_eth.c4
-rw-r--r--net/irda/irnet/irnet.h3
-rw-r--r--net/irda/irnet/irnet_ppp.h11
-rw-r--r--net/irda/irnetlink.c22
-rw-r--r--net/irda/irproc.c1
-rw-r--r--net/irda/irqueue.c34
-rw-r--r--net/iucv/af_iucv.c59
-rw-r--r--net/iucv/iucv.c124
-rw-r--r--net/kcm/Kconfig1
-rw-r--r--net/kcm/kcmproc.c58
-rw-r--r--net/kcm/kcmsock.c545
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/l2tp/l2tp_core.c2
-rw-r--r--net/l2tp/l2tp_core.h13
-rw-r--r--net/l2tp/l2tp_eth.c8
-rw-r--r--net/l2tp/l2tp_ip.c108
-rw-r--r--net/l2tp/l2tp_ip6.c105
-rw-r--r--net/l2tp/l2tp_netlink.c61
-rw-r--r--net/l2tp/l2tp_ppp.c84
-rw-r--r--net/l3mdev/l3mdev.c105
-rw-r--r--net/lapb/lapb_iface.c2
-rw-r--r--net/lapb/lapb_in.c2
-rw-r--r--net/lapb/lapb_out.c2
-rw-r--r--net/lapb/lapb_subr.c2
-rw-r--r--net/lapb/lapb_timer.c2
-rw-r--r--net/llc/af_llc.c28
-rw-r--r--net/llc/llc_conn.c3
-rw-r--r--net/llc/llc_sap.c3
-rw-r--r--net/mac80211/Makefile3
-rw-r--r--net/mac80211/aes_ccm.c46
-rw-r--r--net/mac80211/aes_ccm.h8
-rw-r--r--net/mac80211/aes_cmac.c8
-rw-r--r--net/mac80211/aes_cmac.h4
-rw-r--r--net/mac80211/aes_gcm.c43
-rw-r--r--net/mac80211/aes_gcm.h6
-rw-r--r--net/mac80211/aes_gmac.c26
-rw-r--r--net/mac80211/aes_gmac.h4
-rw-r--r--net/mac80211/agg-rx.c19
-rw-r--r--net/mac80211/cfg.c278
-rw-r--r--net/mac80211/chan.c9
-rw-r--r--net/mac80211/debugfs.c161
-rw-r--r--net/mac80211/debugfs_netdev.c60
-rw-r--r--net/mac80211/debugfs_sta.c65
-rw-r--r--net/mac80211/driver-ops.c17
-rw-r--r--net/mac80211/driver-ops.h109
-rw-r--r--net/mac80211/fils_aead.c342
-rw-r--r--net/mac80211/fils_aead.h19
-rw-r--r--net/mac80211/ieee80211_i.h69
-rw-r--r--net/mac80211/iface.c101
-rw-r--r--net/mac80211/key.c3
-rw-r--r--net/mac80211/main.c29
-rw-r--r--net/mac80211/mesh.c2
-rw-r--r--net/mac80211/mesh_hwmp.c27
-rw-r--r--net/mac80211/mesh_sync.c12
-rw-r--r--net/mac80211/mlme.c93
-rw-r--r--net/mac80211/offchannel.c6
-rw-r--r--net/mac80211/pm.c3
-rw-r--r--net/mac80211/rx.c183
-rw-r--r--net/mac80211/scan.c2
-rw-r--r--net/mac80211/sta_info.c127
-rw-r--r--net/mac80211/sta_info.h28
-rw-r--r--net/mac80211/status.c15
-rw-r--r--net/mac80211/trace.h159
-rw-r--r--net/mac80211/tx.c526
-rw-r--r--net/mac80211/util.c125
-rw-r--r--net/mac80211/vht.c20
-rw-r--r--net/mac80211/wme.c23
-rw-r--r--net/mac80211/wpa.c24
-rw-r--r--net/mac802154/Makefile2
-rw-r--r--net/mac802154/iface.c1
-rw-r--r--net/mac802154/rx.c9
-rw-r--r--net/mac802154/util.c4
-rw-r--r--net/mpls/af_mpls.c55
-rw-r--r--net/mpls/internal.h10
-rw-r--r--net/mpls/mpls_gso.c40
-rw-r--r--net/mpls/mpls_iptunnel.c19
-rw-r--r--net/ncsi/internal.h24
-rw-r--r--net/ncsi/ncsi-aen.c55
-rw-r--r--net/ncsi/ncsi-cmd.c2
-rw-r--r--net/ncsi/ncsi-manage.c320
-rw-r--r--net/ncsi/ncsi-rsp.c4
-rw-r--r--net/netfilter/Kconfig80
-rw-r--r--net/netfilter/Makefile34
-rw-r--r--net/netfilter/core.c223
-rw-r--r--net/netfilter/ipset/Kconfig9
-rw-r--r--net/netfilter/ipset/Makefile1
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h31
-rw-r--r--net/netfilter/ipset/ip_set_core.c22
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h254
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c315
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c37
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c27
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c7
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c7
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c54
-rw-r--r--net/netfilter/nf_conntrack_core.c297
-rw-r--r--net/netfilter/nf_conntrack_ecache.c22
-rw-r--r--net/netfilter/nf_conntrack_ftp.c17
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c2
-rw-r--r--net/netfilter/nf_conntrack_helper.c28
-rw-r--r--net/netfilter/nf_conntrack_netlink.c50
-rw-r--r--net/netfilter/nf_conntrack_pptp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto.c233
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c104
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c39
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c27
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c189
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c131
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c53
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c106
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c20
-rw-r--r--net/netfilter/nf_conntrack_sip.c15
-rw-r--r--net/netfilter/nf_conntrack_standalone.c26
-rw-r--r--net/netfilter/nf_dup_netdev.c35
-rw-r--r--net/netfilter/nf_internals.h11
-rw-r--r--net/netfilter/nf_log.c15
-rw-r--r--net/netfilter/nf_log_common.c32
-rw-r--r--net/netfilter/nf_log_netdev.c81
-rw-r--r--net/netfilter/nf_nat_core.c67
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c36
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c40
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c35
-rw-r--r--net/netfilter/nf_queue.c80
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/netfilter/nf_tables_api.c1021
-rw-r--r--net/netfilter/nf_tables_core.c93
-rw-r--r--net/netfilter/nf_tables_inet.c5
-rw-r--r--net/netfilter/nf_tables_netdev.c101
-rw-r--r--net/netfilter/nf_tables_trace.c28
-rw-r--r--net/netfilter/nfnetlink.c2
-rw-r--r--net/netfilter/nfnetlink_cthelper.c2
-rw-r--r--net/netfilter/nfnetlink_log.c15
-rw-r--r--net/netfilter/nfnetlink_queue.c27
-rw-r--r--net/netfilter/nft_bitwise.c21
-rw-r--r--net/netfilter/nft_byteorder.c28
-rw-r--r--net/netfilter/nft_cmp.c13
-rw-r--r--net/netfilter/nft_counter.c223
-rw-r--r--net/netfilter/nft_ct.c97
-rw-r--r--net/netfilter/nft_dynset.c61
-rw-r--r--net/netfilter/nft_exthdr.c11
-rw-r--r--net/netfilter/nft_fib.c159
-rw-r--r--net/netfilter/nft_fib_inet.c82
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_hash.c442
-rw-r--r--net/netfilter/nft_immediate.c14
-rw-r--r--net/netfilter/nft_limit.c4
-rw-r--r--net/netfilter/nft_log.c17
-rw-r--r--net/netfilter/nft_lookup.c23
-rw-r--r--net/netfilter/nft_masq.c6
-rw-r--r--net/netfilter/nft_meta.c13
-rw-r--r--net/netfilter/nft_nat.c11
-rw-r--r--net/netfilter/nft_numgen.c212
-rw-r--r--net/netfilter/nft_objref.c228
-rw-r--r--net/netfilter/nft_payload.c149
-rw-r--r--net/netfilter/nft_queue.c117
-rw-r--r--net/netfilter/nft_quota.c237
-rw-r--r--net/netfilter/nft_range.c145
-rw-r--r--net/netfilter/nft_redir.c6
-rw-r--r--net/netfilter/nft_reject_inet.c18
-rw-r--r--net/netfilter/nft_rt.c153
-rw-r--r--net/netfilter/nft_set_hash.c424
-rw-r--r--net/netfilter/nft_set_rbtree.c (renamed from net/netfilter/nft_rbtree.c)28
-rw-r--r--net/netfilter/x_tables.c62
-rw-r--r--net/netfilter/xt_AUDIT.c10
-rw-r--r--net/netfilter/xt_CONNSECMARK.c4
-rw-r--r--net/netfilter/xt_CT.c6
-rw-r--r--net/netfilter/xt_LOG.c6
-rw-r--r--net/netfilter/xt_NETMAP.c31
-rw-r--r--net/netfilter/xt_NFLOG.c7
-rw-r--r--net/netfilter/xt_NFQUEUE.c4
-rw-r--r--net/netfilter/xt_RATEEST.c10
-rw-r--r--net/netfilter/xt_REDIRECT.c16
-rw-r--r--net/netfilter/xt_TCPMSS.c16
-rw-r--r--net/netfilter/xt_TEE.c12
-rw-r--r--net/netfilter/xt_TPROXY.c31
-rw-r--r--net/netfilter/xt_addrtype.c10
-rw-r--r--net/netfilter/xt_bpf.c96
-rw-r--r--net/netfilter/xt_cluster.c2
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlabel.c6
-rw-r--r--net/netfilter/xt_connlimit.c22
-rw-r--r--net/netfilter/xt_connmark.c12
-rw-r--r--net/netfilter/xt_conntrack.c16
-rw-r--r--net/netfilter/xt_devgroup.c4
-rw-r--r--net/netfilter/xt_dscp.c2
-rw-r--r--net/netfilter/xt_hashlimit.c342
-rw-r--r--net/netfilter/xt_helper.c8
-rw-r--r--net/netfilter/xt_ipcomp.c2
-rw-r--r--net/netfilter/xt_ipvs.c4
-rw-r--r--net/netfilter/xt_multiport.c52
-rw-r--r--net/netfilter/xt_nat.c18
-rw-r--r--net/netfilter/xt_nfacct.c2
-rw-r--r--net/netfilter/xt_osf.c10
-rw-r--r--net/netfilter/xt_owner.c2
-rw-r--r--net/netfilter/xt_physdev.c4
-rw-r--r--net/netfilter/xt_pkttype.c4
-rw-r--r--net/netfilter/xt_policy.c4
-rw-r--r--net/netfilter/xt_rateest.c28
-rw-r--r--net/netfilter/xt_recent.c19
-rw-r--r--net/netfilter/xt_sctp.c2
-rw-r--r--net/netfilter/xt_set.c38
-rw-r--r--net/netfilter/xt_socket.c336
-rw-r--r--net/netfilter/xt_state.c4
-rw-r--r--net/netfilter/xt_time.c2
-rw-r--r--net/netlabel/netlabel_calipso.c21
-rw-r--r--net/netlabel/netlabel_cipso_v4.c22
-rw-r--r--net/netlabel/netlabel_kapi.c5
-rw-r--r--net/netlabel/netlabel_mgmt.c21
-rw-r--r--net/netlabel/netlabel_unlabeled.c21
-rw-r--r--net/netlink/af_netlink.c38
-rw-r--r--net/netlink/af_netlink.h2
-rw-r--r--net/netlink/diag.c107
-rw-r--r--net/netlink/genetlink.c327
-rw-r--r--net/nfc/netlink.c34
-rw-r--r--net/openvswitch/actions.c202
-rw-r--r--net/openvswitch/conntrack.c21
-rw-r--r--net/openvswitch/datapath.c59
-rw-r--r--net/openvswitch/datapath.h2
-rw-r--r--net/openvswitch/flow.c209
-rw-r--r--net/openvswitch/flow.h34
-rw-r--r--net/openvswitch/flow_netlink.c464
-rw-r--r--net/openvswitch/flow_netlink.h3
-rw-r--r--net/openvswitch/flow_table.c25
-rw-r--r--net/openvswitch/vport-internal_dev.c12
-rw-r--r--net/openvswitch/vport-netdev.c10
-rw-r--r--net/openvswitch/vport.c56
-rw-r--r--net/openvswitch/vport.h3
-rw-r--r--net/packet/af_packet.c181
-rw-r--r--net/phonet/pep-gprs.c12
-rw-r--r--net/phonet/pep.c9
-rw-r--r--net/phonet/pn_dev.c2
-rw-r--r--net/qrtr/qrtr.c4
-rw-r--r--net/rds/Makefile2
-rw-r--r--net/rds/af_rds.c4
-rw-r--r--net/rds/connection.c18
-rw-r--r--net/rds/ib.c2
-rw-r--r--net/rds/ib.h1
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma.c2
-rw-r--r--net/rds/rdma_transport.c5
-rw-r--r--net/rds/rds.h15
-rw-r--r--net/rds/recv.c36
-rw-r--r--net/rds/send.c9
-rw-r--r--net/rds/tcp.c24
-rw-r--r--net/rds/tcp_connect.c14
-rw-r--r--net/rds/tcp_listen.c31
-rw-r--r--net/rds/tcp_send.c3
-rw-r--r--net/rds/threads.c3
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/rose/rose_route.c2
-rw-r--r--net/rxrpc/Kconfig14
-rw-r--r--net/rxrpc/Makefile1
-rw-r--r--net/rxrpc/af_rxrpc.c176
-rw-r--r--net/rxrpc/ar-internal.h840
-rw-r--r--net/rxrpc/call_accept.c717
-rw-r--r--net/rxrpc/call_event.c1447
-rw-r--r--net/rxrpc/call_object.c797
-rw-r--r--net/rxrpc/conn_client.c993
-rw-r--r--net/rxrpc/conn_event.c271
-rw-r--r--net/rxrpc/conn_object.c204
-rw-r--r--net/rxrpc/conn_service.c117
-rw-r--r--net/rxrpc/input.c1438
-rw-r--r--net/rxrpc/insecure.c26
-rw-r--r--net/rxrpc/local_event.c19
-rw-r--r--net/rxrpc/local_object.c51
-rw-r--r--net/rxrpc/misc.c194
-rw-r--r--net/rxrpc/output.c956
-rw-r--r--net/rxrpc/peer_event.c103
-rw-r--r--net/rxrpc/peer_object.c199
-rw-r--r--net/rxrpc/proc.c72
-rw-r--r--net/rxrpc/recvmsg.c870
-rw-r--r--net/rxrpc/rxkad.c209
-rw-r--r--net/rxrpc/security.c18
-rw-r--r--net/rxrpc/sendmsg.c610
-rw-r--r--net/rxrpc/skbuff.c174
-rw-r--r--net/rxrpc/sysctl.c45
-rw-r--r--net/rxrpc/utils.c2
-rw-r--r--net/sched/Kconfig27
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c68
-rw-r--r--net/sched/act_bpf.c24
-rw-r--r--net/sched/act_connmark.c2
-rw-r--r--net/sched/act_csum.c38
-rw-r--r--net/sched/act_gact.c5
-rw-r--r--net/sched/act_ife.c28
-rw-r--r--net/sched/act_ipt.c16
-rw-r--r--net/sched/act_meta_skbtcindex.c79
-rw-r--r--net/sched/act_mirred.c99
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c26
-rw-r--r--net/sched/act_police.c35
-rw-r--r--net/sched/act_simple.c2
-rw-r--r--net/sched/act_skbedit.c23
-rw-r--r--net/sched/act_skbmod.c301
-rw-r--r--net/sched/act_tunnel_key.c349
-rw-r--r--net/sched/act_vlan.c53
-rw-r--r--net/sched/cls_api.c64
-rw-r--r--net/sched/cls_basic.c16
-rw-r--r--net/sched/cls_bpf.c202
-rw-r--r--net/sched/cls_cgroup.c20
-rw-r--r--net/sched/cls_flow.c54
-rw-r--r--net/sched/cls_flower.c576
-rw-r--r--net/sched/cls_fw.c28
-rw-r--r--net/sched/cls_matchall.c128
-rw-r--r--net/sched/cls_route.c24
-rw-r--r--net/sched/cls_rsvp.h20
-rw-r--r--net/sched/cls_tcindex.c103
-rw-r--r--net/sched/cls_u32.c51
-rw-r--r--net/sched/em_ipset.c17
-rw-r--r--net/sched/em_meta.c9
-rw-r--r--net/sched/sch_api.c97
-rw-r--r--net/sched/sch_cbq.c8
-rw-r--r--net/sched/sch_codel.c4
-rw-r--r--net/sched/sch_drr.c6
-rw-r--r--net/sched/sch_fifo.c4
-rw-r--r--net/sched/sch_fq.c89
-rw-r--r--net/sched/sch_generic.c38
-rw-r--r--net/sched/sch_hfsc.c57
-rw-r--r--net/sched/sch_htb.c30
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_mqprio.c2
-rw-r--r--net/sched/sch_netem.c24
-rw-r--r--net/sched/sch_pie.c4
-rw-r--r--net/sched/sch_qfq.c8
-rw-r--r--net/sched/sch_teql.c5
-rw-r--r--net/sctp/associola.c14
-rw-r--r--net/sctp/auth.c2
-rw-r--r--net/sctp/bind_addr.c3
-rw-r--r--net/sctp/chunk.c58
-rw-r--r--net/sctp/endpointola.c5
-rw-r--r--net/sctp/input.c136
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/ipv6.c7
-rw-r--r--net/sctp/offload.c2
-rw-r--r--net/sctp/output.c451
-rw-r--r--net/sctp/outqueue.c105
-rw-r--r--net/sctp/proc.c10
-rw-r--r--net/sctp/protocol.c40
-rw-r--r--net/sctp/sctp_diag.c20
-rw-r--r--net/sctp/sm_make_chunk.c28
-rw-r--r--net/sctp/sm_sideeffect.c25
-rw-r--r--net/sctp/sm_statefuns.c16
-rw-r--r--net/sctp/socket.c63
-rw-r--r--net/sctp/transport.c6
-rw-r--r--net/sctp/ulpevent.c4
-rw-r--r--net/sctp/ulpqueue.c3
-rw-r--r--net/socket.c118
-rw-r--r--net/strparser/Kconfig4
-rw-r--r--net/strparser/Makefile1
-rw-r--r--net/strparser/strparser.c510
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_generic.c13
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c29
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c94
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c4
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c27
-rw-r--r--net/sunrpc/auth_unix.c13
-rw-r--r--net/sunrpc/backchannel_rqst.c8
-rw-r--r--net/sunrpc/cache.c9
-rw-r--r--net/sunrpc/clnt.c142
-rw-r--r--net/sunrpc/netns.h2
-rw-r--r--net/sunrpc/rpc_pipe.c2
-rw-r--r--net/sunrpc/sched.c35
-rw-r--r--net/sunrpc/stats.c10
-rw-r--r--net/sunrpc/sunrpc_syms.c3
-rw-r--r--net/sunrpc/svc.c31
-rw-r--r--net/sunrpc/svc_xprt.c25
-rw-r--r--net/sunrpc/svcauth.c18
-rw-r--r--net/sunrpc/svcauth_unix.c6
-rw-r--r--net/sunrpc/svcsock.c47
-rw-r--r--net/sunrpc/sysctl.c2
-rw-r--r--net/sunrpc/xdr.c11
-rw-r--r--net/sunrpc/xprt.c5
-rw-r--r--net/sunrpc/xprtmultipath.c24
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c55
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c145
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c359
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c36
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c25
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c156
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c156
-rw-r--r--net/sunrpc/xprtrdma/transport.c234
-rw-r--r--net/sunrpc/xprtrdma/verbs.c268
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h140
-rw-r--r--net/sunrpc/xprtsock.c39
-rw-r--r--net/switchdev/switchdev.c292
-rw-r--r--net/sysctl_net.c33
-rw-r--r--net/tipc/bcast.c22
-rw-r--r--net/tipc/bcast.h7
-rw-r--r--net/tipc/bearer.c141
-rw-r--r--net/tipc/bearer.h15
-rw-r--r--net/tipc/core.c2
-rw-r--r--net/tipc/core.h2
-rw-r--r--net/tipc/discover.c4
-rw-r--r--net/tipc/link.c193
-rw-r--r--net/tipc/link.h6
-rw-r--r--net/tipc/monitor.c10
-rw-r--r--net/tipc/msg.c20
-rw-r--r--net/tipc/msg.h31
-rw-r--r--net/tipc/name_distr.c3
-rw-r--r--net/tipc/net.h2
-rw-r--r--net/tipc/netlink.c45
-rw-r--r--net/tipc/netlink_compat.c25
-rw-r--r--net/tipc/node.c106
-rw-r--r--net/tipc/node.h12
-rw-r--r--net/tipc/server.c48
-rw-r--r--net/tipc/socket.c550
-rw-r--r--net/tipc/subscr.c124
-rw-r--r--net/tipc/subscr.h1
-rw-r--r--net/tipc/udp_media.c529
-rw-r--r--net/tipc/udp_media.h46
-rw-r--r--net/unix/af_unix.c76
-rw-r--r--net/vmw_vsock/virtio_transport.c58
-rw-r--r--net/vmw_vsock/virtio_transport_common.c27
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c30
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c30
-rw-r--r--net/wimax/stack.c22
-rw-r--r--net/wireless/Makefile2
-rw-r--r--net/wireless/chan.c2
-rw-r--r--net/wireless/core.c72
-rw-r--r--net/wireless/core.h15
-rw-r--r--net/wireless/ibss.c14
-rw-r--r--net/wireless/lib80211_crypt_tkip.c2
-rw-r--r--net/wireless/mesh.c2
-rw-r--r--net/wireless/mlme.c21
-rw-r--r--net/wireless/nl80211.c1886
-rw-r--r--net/wireless/nl80211.h3
-rw-r--r--net/wireless/rdev-ops.h82
-rw-r--r--net/wireless/scan.c127
-rw-r--r--net/wireless/sme.c25
-rw-r--r--net/wireless/sysfs.c7
-rw-r--r--net/wireless/trace.h127
-rw-r--r--net/wireless/util.c203
-rw-r--r--net/wireless/wext-compat.c21
-rw-r--r--net/wireless/wext-sme.c5
-rw-r--r--net/x25/af_x25.c6
-rw-r--r--net/x25/sysctl_net_x25.c2
-rw-r--r--net/x25/x25_link.c2
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--net/xfrm/xfrm_policy.c152
-rw-r--r--net/xfrm/xfrm_proc.c10
-rw-r--r--net/xfrm/xfrm_replay.c6
-rw-r--r--net/xfrm/xfrm_state.c137
-rw-r--r--net/xfrm/xfrm_sysctl.c4
-rw-r--r--net/xfrm/xfrm_user.c4
832 files changed, 43265 insertions, 20105 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index 86450b7e2899..941df2fa4448 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
- } else {
- neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
}
write_unlock_bh(&n->lock);
}
diff --git a/net/802/fc.c b/net/802/fc.c
index 7b9219022418..1bb496ea997e 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -10,7 +10,7 @@
* v 1.0 03/22/99
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 7d3a0af954e8..6356623fc238 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -141,15 +141,6 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
EXPORT_SYMBOL(fddi_type_trans);
-int fddi_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(fddi_change_mtu);
-
static const struct header_ops fddi_header_ops = {
.create = fddi_header,
};
@@ -161,6 +152,8 @@ static void fddi_setup(struct net_device *dev)
dev->type = ARPHRD_FDDI;
dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */
dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */
+ dev->min_mtu = FDDI_K_SNAP_HLEN;
+ dev->max_mtu = FDDI_K_SNAP_DLEN;
dev->addr_len = FDDI_K_ALEN;
dev->tx_queue_len = 100; /* Long queues on FDDI */
dev->flags = IFF_BROADCAST | IFF_MULTICAST;
diff --git a/net/802/hippi.c b/net/802/hippi.c
index ade1a52cdcff..4460606e9c36 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -34,7 +34,7 @@
#include <linux/errno.h>
#include <net/arp.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Create the HIPPI MAC header for an arbitrary protocol layer
@@ -116,18 +116,6 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
EXPORT_SYMBOL(hippi_type_trans);
-int hippi_change_mtu(struct net_device *dev, int new_mtu)
-{
- /*
- * HIPPI's got these nice large MTUs.
- */
- if ((new_mtu < 68) || (new_mtu > 65280))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(hippi_change_mtu);
-
/*
* For HIPPI we will actually use the lower 4 bytes of the hardware
* address as the I-FIELD rather than the actual hardware address.
@@ -174,6 +162,8 @@ static void hippi_setup(struct net_device *dev)
dev->type = ARPHRD_HIPPI;
dev->hard_header_len = HIPPI_HLEN;
dev->mtu = 65280;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65280;
dev->addr_len = HIPPI_ALEN;
dev->tx_queue_len = 25 /* 5 */;
memset(dev->broadcast, 0xFF, HIPPI_ALEN);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8de138d3306b..467069b73ce1 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -34,7 +34,7 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/if_vlan.h>
#include "vlan.h"
@@ -44,7 +44,7 @@
/* Global VLAN variables */
-int vlan_net_id __read_mostly;
+unsigned int vlan_net_id __read_mostly;
const char vlan_fullname[] = "802.1Q VLAN Support";
const char vlan_version[] = DRV_VERSION;
@@ -515,8 +515,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
return -EFAULT;
/* Null terminate this sucker, just in case. */
- args.device1[23] = 0;
- args.u.device2[23] = 0;
+ args.device1[sizeof(args.device1) - 1] = 0;
+ args.u.device2[sizeof(args.u.device2) - 1] = 0;
rtnl_lock();
@@ -571,8 +571,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
- if ((args.u.name_type >= 0) &&
- (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+ if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) {
struct vlan_net *vn;
vn = net_generic(net, vlan_net_id);
@@ -664,7 +663,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, sizeof(*vhdr));
skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
- pp = ptype->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index cc1557978066..df8bd65dd370 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -159,7 +159,7 @@ void vlan_netlink_fini(void);
extern struct rtnl_link_ops vlan_link_ops;
-extern int vlan_net_id;
+extern unsigned int vlan_net_id;
struct proc_dir_entry;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index fbfacd51aa34..10da6c588bf8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -826,5 +826,8 @@ void vlan_setup(struct net_device *dev)
dev->destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
+
eth_zero_addr(dev->broadcast);
}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 1852e383afd6..553ed4ecb6a0 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -680,7 +680,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
goto error;
/* Create the Protection Domain */
- rdma->pd = ib_alloc_pd(rdma->cm_id->device);
+ rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
if (IS_ERR(rdma->pd))
goto error;
diff --git a/net/Kconfig b/net/Kconfig
index c2cdbce629bd..a29bb4b41c50 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -258,10 +258,6 @@ config XPS
config HWBM
bool
-config SOCK_CGROUP_DATA
- bool
- default n
-
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
@@ -369,6 +365,7 @@ source "net/irda/Kconfig"
source "net/bluetooth/Kconfig"
source "net/rxrpc/Kconfig"
source "net/kcm/Kconfig"
+source "net/strparser/Kconfig"
config FIB_RULES
bool
@@ -401,6 +398,14 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+config LWTUNNEL_BPF
+ bool "Execute BPF program as route nexthop action"
+ depends on LWTUNNEL
+ default y if LWTUNNEL=y
+ ---help---
+ Allows to run BPF programs as a nexthop action following a route
+ lookup for incoming and outgoing packets.
+
config DST_CACHE
bool
default n
diff --git a/net/Makefile b/net/Makefile
index 9bd20bb86cc6..4cafaa2b4667 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_BT) += bluetooth/
obj-$(CONFIG_SUNRPC) += sunrpc/
obj-$(CONFIG_AF_RXRPC) += rxrpc/
obj-$(CONFIG_AF_KCM) += kcm/
+obj-$(CONFIG_STREAM_PARSER) += strparser/
obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_L2TP) += l2tp/
obj-$(CONFIG_DECNET) += decnet/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index f066781be3c8..10d2bdce686e 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1278,7 +1278,7 @@ out:
return err;
}
-#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+#if IS_ENABLED(CONFIG_IPDDP)
static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
{
return skb->data[12] == 22;
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index aa0047c5c467..fca84e111c89 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -620,14 +620,12 @@ error:
static const struct net_device_ops br2684_netdev_ops = {
.ndo_start_xmit = br2684_start_xmit,
.ndo_set_mac_address = br2684_mac_addr,
- .ndo_change_mtu = eth_change_mtu,
.ndo_validate_addr = eth_validate_addr,
};
static const struct net_device_ops br2684_netdev_ops_routed = {
.ndo_start_xmit = br2684_start_xmit,
.ndo_set_mac_address = br2684_mac_addr,
- .ndo_change_mtu = eth_change_mtu
};
static void br2684_setup(struct net_device *netdev)
@@ -651,7 +649,9 @@ static void br2684_setup_routed(struct net_device *netdev)
netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */
netdev->netdev_ops = &br2684_netdev_ops_routed;
netdev->addr_len = 0;
- netdev->mtu = 1500;
+ netdev->mtu = ETH_DATA_LEN;
+ netdev->min_mtu = 0;
+ netdev->max_mtu = ETH_MAX_MTU;
netdev->type = ARPHRD_PPP;
netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
netdev->tx_queue_len = 100;
diff --git a/net/atm/common.c b/net/atm/common.c
index 6dc12305799e..a3ca922d307b 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -630,7 +630,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
goto out;
skb->dev = NULL; /* for paths shared with net_device interfaces */
ATM_SKB(skb)->atm_options = vcc->atm_options;
- if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
+ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
kfree_skb(skb);
error = -EFAULT;
goto out;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index e574a7e9db6f..09cfe87f0a44 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -31,7 +31,7 @@
#include <linux/atmlec.h>
/* Proxy LEC knows about bridging */
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
#include "../bridge/br_private.h"
static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
@@ -111,9 +111,9 @@ static inline void lec_arp_put(struct lec_arp_table *entry)
}
static struct lane2_ops lane2_ops = {
- lane2_resolve, /* resolve, spec 3.1.3 */
- lane2_associate_req, /* associate_req, spec 3.1.4 */
- NULL /* associate indicator, spec 3.1.5 */
+ .resolve = lane2_resolve, /* spec 3.1.3 */
+ .associate_req = lane2_associate_req, /* spec 3.1.4 */
+ .associate_indicator = NULL /* spec 3.1.5 */
};
static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
@@ -121,7 +121,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
/* Device structures */
static struct net_device *dev_lec[MAX_LEC_ITF];
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
{
char *buff;
@@ -155,7 +155,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
sk->sk_data_ready(sk);
}
}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
/*
* Open/initialize the netdevice. This is called (in the current kernel)
@@ -222,7 +222,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
(long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
(long)skb_end_pointer(skb));
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
lec_handle_bridge(skb, dev);
#endif
@@ -426,7 +426,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
(unsigned short)(0xffff & mesg->content.normal.flag);
break;
case l_should_bridge:
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
{
pr_debug("%s: bridge zeppelin asks about %pM\n",
dev->name, mesg->content.proxy.mac_addr);
@@ -452,7 +452,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
sk->sk_data_ready(sk);
}
}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
break;
default:
pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
@@ -544,15 +544,6 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
return 0;
}
-/* shamelessly stolen from drivers/net/net_init.c */
-static int lec_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 68) || (new_mtu > 18190))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static void lec_set_multicast_list(struct net_device *dev)
{
/*
@@ -565,7 +556,6 @@ static const struct net_device_ops lec_netdev_ops = {
.ndo_open = lec_open,
.ndo_stop = lec_close,
.ndo_start_xmit = lec_start_xmit,
- .ndo_change_mtu = lec_change_mtu,
.ndo_tx_timeout = lec_tx_timeout,
.ndo_set_rx_mode = lec_set_multicast_list,
};
@@ -742,6 +732,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
if (!dev_lec[i])
return -ENOMEM;
dev_lec[i]->netdev_ops = &lec_netdev_ops;
+ dev_lec[i]->max_mtu = 18190;
snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
if (register_netdev(dev_lec[i])) {
free_netdev(dev_lec[i]);
@@ -1068,7 +1059,9 @@ static void __exit lane_module_cleanup(void)
{
int i;
+#ifdef CONFIG_PROC_FS
remove_proc_entry("lec", atm_proc_root);
+#endif
deregister_atm_ioctl(&lane_ioctl_ops);
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 0e982222d425..3b3b1a292ec8 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1007,7 +1007,7 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
if (!net_eq(dev_net(dev), &init_net))
return NOTIFY_DONE;
- if (dev->name == NULL || strncmp(dev->name, "lec", 3))
+ if (strncmp(dev->name, "lec", 3))
return NOTIFY_DONE; /* we are only interested in lec:s */
switch (event) {
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 9e60e74c807d..a89fdebeffda 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -535,33 +535,32 @@ static void eg_destroy_cache(struct mpoa_client *mpc)
static const struct in_cache_ops ingress_ops = {
- in_cache_add_entry, /* add_entry */
- in_cache_get, /* get */
- in_cache_get_with_mask, /* get_with_mask */
- in_cache_get_by_vcc, /* get_by_vcc */
- in_cache_put, /* put */
- in_cache_remove_entry, /* remove_entry */
- cache_hit, /* cache_hit */
- clear_count_and_expired, /* clear_count */
- check_resolving_entries, /* check_resolving */
- refresh_entries, /* refresh */
- in_destroy_cache /* destroy_cache */
+ .add_entry = in_cache_add_entry,
+ .get = in_cache_get,
+ .get_with_mask = in_cache_get_with_mask,
+ .get_by_vcc = in_cache_get_by_vcc,
+ .put = in_cache_put,
+ .remove_entry = in_cache_remove_entry,
+ .cache_hit = cache_hit,
+ .clear_count = clear_count_and_expired,
+ .check_resolving = check_resolving_entries,
+ .refresh = refresh_entries,
+ .destroy_cache = in_destroy_cache
};
static const struct eg_cache_ops egress_ops = {
- eg_cache_add_entry, /* add_entry */
- eg_cache_get_by_cache_id, /* get_by_cache_id */
- eg_cache_get_by_tag, /* get_by_tag */
- eg_cache_get_by_vcc, /* get_by_vcc */
- eg_cache_get_by_src_ip, /* get_by_src_ip */
- eg_cache_put, /* put */
- eg_cache_remove_entry, /* remove_entry */
- update_eg_cache_entry, /* update */
- clear_expired, /* clear_expired */
- eg_destroy_cache /* destroy_cache */
+ .add_entry = eg_cache_add_entry,
+ .get_by_cache_id = eg_cache_get_by_cache_id,
+ .get_by_tag = eg_cache_get_by_tag,
+ .get_by_vcc = eg_cache_get_by_vcc,
+ .get_by_src_ip = eg_cache_get_by_src_ip,
+ .put = eg_cache_put,
+ .remove_entry = eg_cache_remove_entry,
+ .update = update_eg_cache_entry,
+ .clear_expired = clear_expired,
+ .destroy_cache = eg_destroy_cache
};
-
void atm_mpoa_init_cache(struct mpoa_client *mpc)
{
mpc->in_ops = &ingress_ops;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 2fdebabbfacd..90fcf5fc2e0a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -32,7 +32,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index e7c9b0ea17a1..ac2542b7be88 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 3d106767b272..9a3a301e1e2f 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -23,7 +23,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 9bd31e88aeca..891596e74278 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,7 +22,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index e05bd57b5afd..28827e81ba2b 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -23,7 +23,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 5237dff6941d..5fb2104b7304 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
index 7f16e8a931b2..8c07c28569e4 100644
--- a/net/ax25/ax25_iface.c
+++ b/net/ax25/ax25_iface.c
@@ -23,7 +23,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index bb5a0e4e98d9..860752639b1a 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 2fa3be965101..183b1c583d56 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -23,7 +23,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index 8ddd41baa81c..b11a5f466fcc 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -25,7 +25,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index d39097737e38..e1fda27cb27c 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -31,7 +31,7 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 3fbf8f7b2cf4..8632b86e843e 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,7 +29,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
index 8b66a41e538f..94bd06396a43 100644
--- a/net/ax25/ax25_std_subr.c
+++ b/net/ax25/ax25_std_subr.c
@@ -20,7 +20,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 2c0d6ef66f9d..30bbc675261d 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 655a7d4c96e1..038b109b2be7 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -264,7 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY))
ax25_stop_heartbeat(ax25);
ax25_stop_t1timer(ax25);
ax25_stop_t2timer(ax25);
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
index c3cffa79bafb..23a6f38a80bf 100644
--- a/net/ax25/ax25_timer.c
+++ b/net/ax25/ax25_timer.c
@@ -28,7 +28,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 4ad2fb7bcd35..0403b0def7e6 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -25,7 +25,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 833bb145ba3c..b73b96a2854b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -17,7 +17,7 @@ config BATMAN_ADV
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol (experimental)"
- depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m)
+ depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default n
help
This option enables the B.A.T.M.A.N. V protocol, the successor
@@ -73,10 +73,21 @@ config BATMAN_ADV_MCAST
reduce the air overhead while improving the reliability of
multicast messages.
-config BATMAN_ADV_DEBUG
- bool "B.A.T.M.A.N. debugging"
+config BATMAN_ADV_DEBUGFS
+ bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
+ default y
+ help
+ Enable this to export routing related debug tables via debugfs.
+ The information for each soft-interface and used hard-interface can be
+ found under batman_adv/
+
+ If unsure, say Y.
+
+config BATMAN_ADV_DEBUG
+ bool "B.A.T.M.A.N. debugging"
+ depends on BATMAN_ADV_DEBUGFS
help
This is an option for use by developers; most people should
say N here. This enables compilation of support for
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index a83fc6c58d19..f724d3c98a81 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -24,14 +24,14 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
batman-adv-y += bitarray.o
batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
-batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o
batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
batman-adv-y += fragmentation.o
batman-adv-y += gateway_client.o
batman-adv-y += gateway_common.o
batman-adv-y += hard-interface.o
batman-adv-y += hash.o
-batman-adv-y += icmp_socket.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o
batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
batman-adv-y += main.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 81dbbf569bd4..623d04302aa2 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -20,12 +20,18 @@
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/moduleparam.h>
+#include <linux/netlink.h>
#include <linux/printk.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
+#include "netlink.h"
char batadv_routing_algo[20] = "BATMAN_IV";
static struct hlist_head batadv_algo_list;
@@ -95,6 +101,7 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
return 0;
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
{
struct batadv_algo_ops *bat_algo_ops;
@@ -107,6 +114,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
return 0;
}
+#endif
static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
{
@@ -138,3 +146,65 @@ static struct kparam_string batadv_param_string_ra = {
module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
0644);
+
+/**
+ * batadv_algo_dump_entry - fill in information about one supported routing
+ * algorithm
+ * @msg: netlink message to be sent back
+ * @portid: Port to reply to
+ * @seq: Sequence number of message
+ * @bat_algo_ops: Algorithm to be dumped
+ *
+ * Return: Error number, or 0 on success
+ */
+static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_algo_ops *bat_algo_ops)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_algo_dump - fill in information about supported routing
+ * algorithms
+ * @msg: netlink message to be sent back
+ * @cb: Parameters to the netlink request
+ *
+ * Return: Length of reply message.
+ */
+int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_algo_ops *bat_algo_ops;
+ int skip = cb->args[0];
+ int i = 0;
+
+ hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
+ if (i++ < skip)
+ continue;
+
+ if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
+ bat_algo_ops)) {
+ i--;
+ break;
+ }
+ }
+
+ cb->args[0] = i;
+
+ return msg->len;
+}
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 860d773dd8fa..3b5b69cdd12b 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -22,7 +22,9 @@
#include <linux/types.h>
+struct netlink_callback;
struct seq_file;
+struct sk_buff;
extern char batadv_routing_algo[];
extern struct list_head batadv_hardif_list;
@@ -31,5 +33,6 @@ void batadv_algo_init(void);
int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb);
#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 19b0abd6c640..f00f666e2ccd 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -35,6 +35,7 @@
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/pkt_sched.h>
#include <linux/printk.h>
#include <linux/random.h>
@@ -48,12 +49,17 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bitarray.h"
+#include "gateway_client.h"
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
+#include "netlink.h"
#include "network-coding.h"
#include "originator.h"
#include "packet.h"
@@ -318,17 +324,18 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
if (!orig_node->bat_iv.bcast_own_sum)
goto free_orig_node;
+ kref_get(&orig_node->refcount);
hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
batadv_choose_orig, orig_node,
&orig_node->hash_entry);
if (hash_added != 0)
- goto free_orig_node;
+ goto free_orig_node_hash;
return orig_node;
-free_orig_node:
- /* free twice, as batadv_orig_node_new sets refcount to 2 */
+free_orig_node_hash:
batadv_orig_node_put(orig_node);
+free_orig_node:
batadv_orig_node_put(orig_node);
return NULL;
@@ -528,36 +535,25 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
{
struct net_device *soft_iface;
- struct batadv_priv *bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
if (!forw_packet->if_incoming) {
pr_err("Error - can't forward packet: incoming iface not specified\n");
- goto out;
+ return;
}
soft_iface = forw_packet->if_incoming->soft_iface;
- bat_priv = netdev_priv(soft_iface);
if (WARN_ON(!forw_packet->if_outgoing))
- goto out;
+ return;
if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface))
- goto out;
+ return;
if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE)
- goto out;
-
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
+ return;
/* only for one specific outgoing interface */
batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing);
-
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
}
/**
@@ -685,19 +681,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
struct batadv_forw_packet *forw_packet_aggr;
unsigned char *skb_buff;
unsigned int skb_size;
+ atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left;
- /* own packet should always be scheduled */
- if (!own_packet) {
- if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "batman packet queue full\n");
- return;
- }
- }
-
- forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC);
+ forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing,
+ queue_left, bat_priv);
if (!forw_packet_aggr)
- goto out_nomem;
+ return;
if (atomic_read(&bat_priv->aggregated_ogms) &&
packet_len < BATADV_MAX_AGGREGATION_BYTES)
@@ -708,8 +697,11 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
skb_size += ETH_HLEN;
forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
- if (!forw_packet_aggr->skb)
- goto out_free_forw_packet;
+ if (!forw_packet_aggr->skb) {
+ batadv_forw_packet_free(forw_packet_aggr, true);
+ return;
+ }
+
forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
@@ -717,12 +709,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
forw_packet_aggr->packet_len = packet_len;
memcpy(skb_buff, packet_buff, packet_len);
- kref_get(&if_incoming->refcount);
- kref_get(&if_outgoing->refcount);
forw_packet_aggr->own = own_packet;
- forw_packet_aggr->if_incoming = if_incoming;
- forw_packet_aggr->if_outgoing = if_outgoing;
- forw_packet_aggr->num_packets = 0;
forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS;
forw_packet_aggr->send_time = send_time;
@@ -730,24 +717,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
if (direct_link)
forw_packet_aggr->direct_link_flags |= 1;
- /* add new packet to packet list */
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list);
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
-
- /* start timer for this packet */
INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
batadv_iv_send_outstanding_bat_ogm_packet);
- queue_delayed_work(batadv_event_workqueue,
- &forw_packet_aggr->delayed_work,
- send_time - jiffies);
-
- return;
-out_free_forw_packet:
- kfree(forw_packet_aggr);
-out_nomem:
- if (!own_packet)
- atomic_inc(&bat_priv->batman_queue_left);
+
+ batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time);
}
/* aggregate a new packet into the existing ogm packet */
@@ -1292,7 +1265,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
*/
tq_iface_penalty = BATADV_TQ_MAX_VALUE;
if (if_outgoing && (if_incoming == if_outgoing) &&
- batadv_is_wifi_netdev(if_outgoing->net_dev))
+ batadv_is_wifi_hardif(if_outgoing))
tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
bat_priv);
@@ -1631,7 +1604,7 @@ out:
if (hardif_neigh)
batadv_hardif_neigh_put(hardif_neigh);
- kfree_skb(skb_priv);
+ consume_skb(skb_priv);
}
/**
@@ -1803,17 +1776,17 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
struct delayed_work *delayed_work;
struct batadv_forw_packet *forw_packet;
struct batadv_priv *bat_priv;
+ bool dropped = false;
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_del(&forw_packet->list);
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
goto out;
+ }
batadv_iv_ogm_emit(forw_packet);
@@ -1830,11 +1803,10 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
batadv_iv_ogm_schedule(forw_packet->if_incoming);
out:
- /* don't count own packet */
- if (!forw_packet->own)
- atomic_inc(&bat_priv->batman_queue_left);
-
- batadv_forw_packet_free(forw_packet);
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bat_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
}
static int batadv_iv_ogm_receive(struct sk_buff *skb,
@@ -1844,17 +1816,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
struct batadv_ogm_packet *ogm_packet;
u8 *packet_pos;
int ogm_offset;
- bool ret;
+ bool res;
+ int ret = NET_RX_DROP;
- ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
- if (!ret)
- return NET_RX_DROP;
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
+ if (!res)
+ goto free_skb;
/* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
* that does not have B.A.T.M.A.N. IV enabled ?
*/
if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
- return NET_RX_DROP;
+ goto free_skb;
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -1875,10 +1848,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm_packet *)packet_pos;
}
- kfree_skb(skb);
- return NET_RX_SUCCESS;
+ ret = NET_RX_SUCCESS;
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table
* @orig_node: the orig_node for which the neighbors are printed
@@ -1976,8 +1957,239 @@ next:
if (batman_count == 0)
seq_puts(seq, "No batman nodes in range ...\n");
}
+#endif
+
+/**
+ * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a
+ * given outgoing interface.
+ * @neigh_node: Neighbour of interest
+ * @if_outgoing: Outgoing interface of interest
+ * @tq_avg: Pointer of where to store the TQ average
+ *
+ * Return: False if no average TQ available, otherwise true.
+ */
+static bool
+batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_outgoing,
+ u8 *tq_avg)
+{
+ struct batadv_neigh_ifinfo *n_ifinfo;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ return false;
+
+ *tq_avg = n_ifinfo->bat_iv.tq_avg;
+ batadv_neigh_ifinfo_put(n_ifinfo);
+
+ return true;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ bool best)
+{
+ void *hdr;
+ u8 tq_avg;
+ unsigned int last_seen_msecs;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+ if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg))
+ return 0;
+
+ if (if_outgoing != BATADV_IF_DEFAULT &&
+ if_outgoing != neigh_node->if_incoming)
+ return 0;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ orig_node->orig) ||
+ nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ neigh_node->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ neigh_node->if_incoming->net_dev->ifindex) ||
+ nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
/**
+ * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node, int *sub_s)
+{
+ struct batadv_neigh_node *neigh_node_best;
+ struct batadv_neigh_node *neigh_node;
+ int sub = 0;
+ bool best;
+ u8 tq_avg_best;
+
+ neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+ if (!neigh_node_best)
+ goto out;
+
+ if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing,
+ &tq_avg_best))
+ goto out;
+
+ if (tq_avg_best == 0)
+ goto out;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (neigh_node == neigh_node_best);
+
+ if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq,
+ bat_priv, if_outgoing,
+ orig_node, neigh_node,
+ best)) {
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ out:
+ if (neigh_node_best)
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_orig_node *orig_node;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node,
+ sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_iv_ogm_orig_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, if_outgoing, head,
+ &idx, &sub))
+ break;
+
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+/**
* batadv_iv_hardif_neigh_print - print a single hop neighbour node
* @seq: neighbour table seq_file struct
* @hardif_neigh: hardif neighbour information
@@ -2027,37 +2239,43 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
if (batman_count == 0)
seq_puts(seq, "No batman nodes in range ...\n");
}
+#endif
/**
- * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
+ * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors
* @neigh1: the first neighbor object of the comparison
* @if_outgoing1: outgoing interface for the first neighbor
* @neigh2: the second neighbor object of the comparison
* @if_outgoing2: outgoing interface for the second neighbor
+ * @diff: pointer to integer receiving the calculated difference
*
- * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
- * lower, the same as or higher than the metric via neigh2
+ * The content of *@diff is only valid when this function returns true.
+ * It is less, equal to or greater than 0 if the metric via neigh1 is lower,
+ * the same as or higher than the metric via neigh2
+ *
+ * Return: true when the difference could be calculated, false otherwise
*/
-static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
- struct batadv_hard_iface *if_outgoing1,
- struct batadv_neigh_node *neigh2,
- struct batadv_hard_iface *if_outgoing2)
+static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2,
+ int *diff)
{
struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
u8 tq1, tq2;
- int diff;
+ bool ret = true;
neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
if (!neigh1_ifinfo || !neigh2_ifinfo) {
- diff = 0;
+ ret = false;
goto out;
}
tq1 = neigh1_ifinfo->bat_iv.tq_avg;
tq2 = neigh2_ifinfo->bat_iv.tq_avg;
- diff = tq1 - tq2;
+ *diff = (int)tq1 - (int)tq2;
out:
if (neigh1_ifinfo)
@@ -2065,6 +2283,162 @@ out:
if (neigh2_ifinfo)
batadv_neigh_ifinfo_put(neigh2_ifinfo);
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ void *hdr;
+ unsigned int last_seen_msecs;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ hardif_neigh->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hardif_neigh->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface
+ * into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @hard_iface: Hard interface to dump the neighbours for
+ * @idx_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ int *idx_s)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ int idx = 0;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq,
+ hardif_neigh)) {
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @single_hardif: Limit dump to this hard interfaace
+ */
+static void
+batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *single_hardif)
+{
+ struct batadv_hard_iface *hard_iface;
+ int i_hardif = 0;
+ int i_hardif_s = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ rcu_read_lock();
+ if (single_hardif) {
+ if (i_hardif_s == 0) {
+ if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv,
+ single_hardif,
+ &idx) == 0)
+ i_hardif++;
+ }
+ } else {
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list,
+ list) {
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ if (i_hardif++ < i_hardif_s)
+ continue;
+
+ if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv,
+ hard_iface, &idx)) {
+ i_hardif--;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = i_hardif;
+ cb->args[1] = idx;
+}
+
+/**
+ * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ *
+ * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
+ * lower, the same as or higher than the metric via neigh2
+ */
+static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ bool ret;
+ int diff;
+
+ ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
+ if_outgoing2, &diff);
+ if (!ret)
+ return 0;
+
return diff;
}
@@ -2085,36 +2459,341 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2)
{
- struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
- u8 tq1, tq2;
bool ret;
+ int diff;
- neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
- neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
+ if_outgoing2, &diff);
+ if (!ret)
+ return false;
- /* we can't say that the metric is better */
- if (!neigh1_ifinfo || !neigh2_ifinfo) {
- ret = false;
+ ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD;
+ return ret;
+}
+
+static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
+{
+ /* begin scheduling originator messages on that interface */
+ batadv_iv_ogm_schedule(hard_iface);
+}
+
+static struct batadv_gw_node *
+batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo;
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ u64 max_gw_factor = 0;
+ u64 tmp_gw_factor = 0;
+ u8 max_tq = 0;
+ u8 tq_avg;
+ struct batadv_orig_node *orig_node;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ orig_node = gw_node->orig_node;
+ router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ continue;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router,
+ BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto next;
+
+ if (!kref_get_unless_zero(&gw_node->refcount))
+ goto next;
+
+ tq_avg = router_ifinfo->bat_iv.tq_avg;
+
+ switch (atomic_read(&bat_priv->gw.sel_class)) {
+ case 1: /* fast connection */
+ tmp_gw_factor = tq_avg * tq_avg;
+ tmp_gw_factor *= gw_node->bandwidth_down;
+ tmp_gw_factor *= 100 * 100;
+ tmp_gw_factor >>= 18;
+
+ if ((tmp_gw_factor > max_gw_factor) ||
+ ((tmp_gw_factor == max_gw_factor) &&
+ (tq_avg > max_tq))) {
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ }
+ break;
+
+ default: /* 2: stable connection (use best statistic)
+ * 3: fast-switch (use best statistic but change as
+ * soon as a better gateway appears)
+ * XX: late-switch (use best statistic but change as
+ * soon as a better gateway appears which has
+ * $routing_class more tq points)
+ */
+ if (tq_avg > max_tq) {
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ }
+ break;
+ }
+
+ if (tq_avg > max_tq)
+ max_tq = tq_avg;
+
+ if (tmp_gw_factor > max_gw_factor)
+ max_gw_factor = tmp_gw_factor;
+
+ batadv_gw_node_put(gw_node);
+
+next:
+ batadv_neigh_node_put(router);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ }
+ rcu_read_unlock();
+
+ return curr_gw;
+}
+
+static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL;
+ struct batadv_neigh_node *router_gw = NULL;
+ struct batadv_neigh_node *router_orig = NULL;
+ u8 gw_tq_avg, orig_tq_avg;
+ bool ret = false;
+
+ /* dynamic re-election is performed only on fast or late switch */
+ if (atomic_read(&bat_priv->gw.sel_class) <= 2)
+ return false;
+
+ router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
+ if (!router_gw) {
+ ret = true;
goto out;
}
- tq1 = neigh1_ifinfo->bat_iv.tq_avg;
- tq2 = neigh2_ifinfo->bat_iv.tq_avg;
- ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD;
+ router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw,
+ BATADV_IF_DEFAULT);
+ if (!router_gw_ifinfo) {
+ ret = true;
+ goto out;
+ }
+
+ router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router_orig)
+ goto out;
+
+ router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig,
+ BATADV_IF_DEFAULT);
+ if (!router_orig_ifinfo)
+ goto out;
+
+ gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg;
+ orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg;
+ /* the TQ value has to be better */
+ if (orig_tq_avg < gw_tq_avg)
+ goto out;
+
+ /* if the routing class is greater than 3 the value tells us how much
+ * greater the TQ value of the new gateway must be
+ */
+ if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
+ (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
+ gw_tq_avg, orig_tq_avg);
+
+ ret = true;
out:
- if (neigh1_ifinfo)
- batadv_neigh_ifinfo_put(neigh1_ifinfo);
- if (neigh2_ifinfo)
- batadv_neigh_ifinfo_put(neigh2_ifinfo);
+ if (router_gw_ifinfo)
+ batadv_neigh_ifinfo_put(router_gw_ifinfo);
+ if (router_orig_ifinfo)
+ batadv_neigh_ifinfo_put(router_orig_ifinfo);
+ if (router_gw)
+ batadv_neigh_node_put(router_gw);
+ if (router_orig)
+ batadv_neigh_node_put(router_orig);
return ret;
}
-static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+/* fails if orig_node has no router */
+static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv,
+ struct seq_file *seq,
+ const struct batadv_gw_node *gw_node)
{
- /* begin scheduling originator messages on that interface */
- batadv_iv_ogm_schedule(hard_iface);
+ struct batadv_gw_node *curr_gw;
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ int ret = -1;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
+ (curr_gw == gw_node ? "=>" : " "),
+ gw_node->orig_node->orig,
+ router_ifinfo->bat_iv.tq_avg, router->addr,
+ router->if_incoming->net_dev->name,
+ gw_node->bandwidth_down / 10,
+ gw_node->bandwidth_down % 10,
+ gw_node->bandwidth_up / 10,
+ gw_node->bandwidth_up % 10);
+ ret = seq_has_overflowed(seq) ? -1 : 0;
+
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq)
+{
+ struct batadv_gw_node *gw_node;
+ int gw_count = 0;
+
+ seq_puts(seq,
+ " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ /* fails if orig_node has no router */
+ if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
+ continue;
+
+ gw_count++;
+ }
+ rcu_read_unlock();
+
+ if (gw_count == 0)
+ seq_puts(seq, "No gateways in range ...\n");
+}
+#endif
+
+/**
+ * batadv_iv_gw_dump_entry - Dump a gateway into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @gw_node: Gateway to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_gw_node *gw_node)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *router;
+ struct batadv_gw_node *curr_gw;
+ int ret = -EINVAL;
+ void *hdr;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ ret = -EMSGSIZE;
+
+ if (curr_gw == gw_node)
+ if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ gw_node->orig_node->orig) ||
+ nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) ||
+ nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN,
+ router->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ router->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
+ gw_node->bandwidth_down) ||
+ nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP,
+ gw_node->bandwidth_up)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+/**
+ * batadv_iv_gw_dump - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ */
+static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_gw_node *gw_node;
+ int idx_skip = cb->args[0];
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (idx++ < idx_skip)
+ continue;
+
+ if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
+ bat_priv, gw_node)) {
+ idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ idx_skip = idx;
+unlock:
+ rcu_read_unlock();
+
+ cb->args[0] = idx_skip;
}
static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
@@ -2129,14 +2808,28 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.neigh = {
.cmp = batadv_iv_ogm_neigh_cmp,
.is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
.print = batadv_iv_neigh_print,
+#endif
+ .dump = batadv_iv_ogm_neigh_dump,
},
.orig = {
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
.print = batadv_iv_ogm_orig_print,
+#endif
+ .dump = batadv_iv_ogm_orig_dump,
.free = batadv_iv_ogm_orig_free,
.add_if = batadv_iv_ogm_orig_add_if,
.del_if = batadv_iv_ogm_orig_del_if,
},
+ .gw = {
+ .get_best_gw_node = batadv_iv_gw_get_best_gw_node,
+ .is_eligible = batadv_iv_gw_is_eligible,
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+ .print = batadv_iv_gw_print,
+#endif
+ .dump = batadv_iv_gw_dump,
+ },
};
int __init batadv_iv_init(void)
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 0366cbf5e444..2ac612d7bab4 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -21,24 +21,38 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/cache.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bat_v_elp.h"
#include "bat_v_ogm.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
#include "hard-interface.h"
#include "hash.h"
+#include "log.h"
+#include "netlink.h"
#include "originator.h"
#include "packet.h"
+struct sk_buff;
+
static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
@@ -115,6 +129,7 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
batadv_v_elp_throughput_metric_update);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_v_orig_print_neigh - print neighbors for the originator table
* @orig_node: the orig_node for which the neighbors are printed
@@ -198,8 +213,142 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
if (batman_count == 0)
seq_puts(seq, "No batman nodes in range ...\n");
}
+#endif
+
+/**
+ * batadv_v_neigh_dump_neigh - Dump a neighbour into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to dump
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ void *hdr;
+ unsigned int last_seen_msecs;
+ u32 throughput;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+ throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+ throughput = throughput * 100;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_NEIGHBORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ hardif_neigh->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hardif_neigh->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs) ||
+ nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
/**
+ * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into
+ * a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @hard_iface: The hard interface to be dumped
+ * @idx_s: Entries to be skipped
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ int *idx_s)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ int idx = 0;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) {
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a
+ * message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @single_hardif: Limit dumping to this hard interface
+ */
+static void
+batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *single_hardif)
+{
+ struct batadv_hard_iface *hard_iface;
+ int i_hardif = 0;
+ int i_hardif_s = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ rcu_read_lock();
+ if (single_hardif) {
+ if (i_hardif_s == 0) {
+ if (batadv_v_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, single_hardif,
+ &idx) == 0)
+ i_hardif++;
+ }
+ } else {
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ if (i_hardif++ < i_hardif_s)
+ continue;
+
+ if (batadv_v_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, hard_iface,
+ &idx)) {
+ i_hardif--;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = i_hardif;
+ cb->args[1] = idx;
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+/**
* batadv_v_orig_print - print the originator table
* @bat_priv: the bat priv with all the soft interface information
* @seq: debugfs table seq_file struct
@@ -265,6 +414,205 @@ next:
if (batman_count == 0)
seq_puts(seq, "No batman nodes in range ...\n");
}
+#endif
+
+/**
+ * batadv_v_orig_dump_subentry - Dump an originator subentry into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ bool best)
+{
+ struct batadv_neigh_ifinfo *n_ifinfo;
+ unsigned int last_seen_msecs;
+ u32 throughput;
+ void *hdr;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ return 0;
+
+ throughput = n_ifinfo->bat_v.throughput * 100;
+
+ batadv_neigh_ifinfo_put(n_ifinfo);
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+ if (if_outgoing != BATADV_IF_DEFAULT &&
+ if_outgoing != neigh_node->if_incoming)
+ return 0;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_ORIGINATORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) ||
+ nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ neigh_node->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ neigh_node->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_v_orig_dump_entry - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node, int *sub_s)
+{
+ struct batadv_neigh_node *neigh_node_best;
+ struct batadv_neigh_node *neigh_node;
+ int sub = 0;
+ bool best;
+
+ neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+ if (!neigh_node_best)
+ goto out;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (neigh_node == neigh_node_best);
+
+ if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node,
+ neigh_node, best)) {
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ out:
+ if (neigh_node_best)
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_orig_dump_bucket - Dump an originator bucket into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_orig_node *orig_node;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node, sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_orig_dump - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_v_orig_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, if_outgoing, head, &idx,
+ &sub))
+ break;
+
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+}
static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing1,
@@ -320,6 +668,365 @@ err_ifinfo1:
return ret;
}
+static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
+ char *buff, size_t count)
+{
+ u32 old_class, class;
+
+ if (!batadv_parse_throughput(bat_priv->soft_iface, buff,
+ "B.A.T.M.A.N. V GW selection class",
+ &class))
+ return -EINVAL;
+
+ old_class = atomic_read(&bat_priv->gw.sel_class);
+ atomic_set(&bat_priv->gw.sel_class, class);
+
+ if (old_class != class)
+ batadv_gw_reselect(bat_priv);
+
+ return count;
+}
+
+static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff)
+{
+ u32 class = atomic_read(&bat_priv->gw.sel_class);
+
+ return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10);
+}
+
+/**
+ * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW
+ * @gw_node: the GW to retrieve the metric for
+ * @bw: the pointer where the metric will be stored. The metric is computed as
+ * the minimum between the GW advertised throughput and the path throughput to
+ * it in the mesh
+ *
+ * Return: 0 on success, -1 on failure
+ */
+static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_orig_node *orig_node;
+ struct batadv_neigh_node *router;
+ int ret = -1;
+
+ orig_node = gw_node->orig_node;
+ router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ /* the GW metric is computed as the minimum between the path throughput
+ * to reach the GW itself and the advertised bandwidth.
+ * This gives us an approximation of the effective throughput that the
+ * client can expect via this particular GW node
+ */
+ *bw = router_ifinfo->bat_v.throughput;
+ *bw = min_t(u32, *bw, gw_node->bandwidth_down);
+
+ ret = 0;
+out:
+ if (router)
+ batadv_neigh_node_put(router);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+
+ return ret;
+}
+
+/**
+ * batadv_v_gw_get_best_gw_node - retrieve the best GW node
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: the GW node having the best GW-metric, NULL if no GW is known
+ */
+static struct batadv_gw_node *
+batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ u32 max_bw = 0, bw;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (!kref_get_unless_zero(&gw_node->refcount))
+ continue;
+
+ if (batadv_v_gw_throughput_get(gw_node, &bw) < 0)
+ goto next;
+
+ if (curr_gw && (bw <= max_bw))
+ goto next;
+
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ max_bw = bw;
+
+next:
+ batadv_gw_node_put(gw_node);
+ }
+ rcu_read_unlock();
+
+ return curr_gw;
+}
+
+/**
+ * batadv_v_gw_is_eligible - check if a originator would be selected as GW
+ * @bat_priv: the bat priv with all the soft interface information
+ * @curr_gw_orig: originator representing the currently selected GW
+ * @orig_node: the originator representing the new candidate
+ *
+ * Return: true if orig_node can be selected as current GW, false otherwise
+ */
+static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_gw_node *curr_gw, *orig_gw = NULL;
+ u32 gw_throughput, orig_throughput, threshold;
+ bool ret = false;
+
+ threshold = atomic_read(&bat_priv->gw.sel_class);
+
+ curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig);
+ if (!curr_gw) {
+ ret = true;
+ goto out;
+ }
+
+ if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) {
+ ret = true;
+ goto out;
+ }
+
+ orig_gw = batadv_gw_node_get(bat_priv, orig_node);
+ if (!orig_node)
+ goto out;
+
+ if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0)
+ goto out;
+
+ if (orig_throughput < gw_throughput)
+ goto out;
+
+ if ((orig_throughput - gw_throughput) < threshold)
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n",
+ gw_throughput, orig_throughput);
+
+ ret = true;
+out:
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+ if (orig_gw)
+ batadv_gw_node_put(orig_gw);
+
+ return ret;
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+/* fails if orig_node has no router */
+static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv,
+ struct seq_file *seq,
+ const struct batadv_gw_node *gw_node)
+{
+ struct batadv_gw_node *curr_gw;
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ int ret = -1;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n",
+ (curr_gw == gw_node ? "=>" : " "),
+ gw_node->orig_node->orig,
+ router_ifinfo->bat_v.throughput / 10,
+ router_ifinfo->bat_v.throughput % 10, router->addr,
+ router->if_incoming->net_dev->name,
+ gw_node->bandwidth_down / 10,
+ gw_node->bandwidth_down % 10,
+ gw_node->bandwidth_up / 10,
+ gw_node->bandwidth_up % 10);
+ ret = seq_has_overflowed(seq) ? -1 : 0;
+
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+/**
+ * batadv_v_gw_print - print the gateway list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: gateway table seq_file struct
+ */
+static void batadv_v_gw_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq)
+{
+ struct batadv_gw_node *gw_node;
+ int gw_count = 0;
+
+ seq_puts(seq,
+ " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ /* fails if orig_node has no router */
+ if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
+ continue;
+
+ gw_count++;
+ }
+ rcu_read_unlock();
+
+ if (gw_count == 0)
+ seq_puts(seq, "No gateways in range ...\n");
+}
+#endif
+
+/**
+ * batadv_v_gw_dump_entry - Dump a gateway into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @gw_node: Gateway to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_gw_node *gw_node)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *router;
+ struct batadv_gw_node *curr_gw;
+ int ret = -EINVAL;
+ void *hdr;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ ret = -EMSGSIZE;
+
+ if (curr_gw == gw_node) {
+ if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ gw_node->orig_node->orig)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT,
+ router_ifinfo->bat_v.throughput)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ router->if_incoming->net_dev->name)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
+ gw_node->bandwidth_down)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+/**
+ * batadv_v_gw_dump - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ */
+static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_gw_node *gw_node;
+ int idx_skip = cb->args[0];
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (idx++ < idx_skip)
+ continue;
+
+ if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
+ bat_priv, gw_node)) {
+ idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ idx_skip = idx;
+unlock:
+ rcu_read_unlock();
+
+ cb->args[0] = idx_skip;
+}
+
static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.name = "BATMAN_V",
.iface = {
@@ -333,10 +1040,26 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.hardif_init = batadv_v_hardif_neigh_init,
.cmp = batadv_v_neigh_cmp,
.is_similar_or_better = batadv_v_neigh_is_sob,
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
.print = batadv_v_neigh_print,
+#endif
+ .dump = batadv_v_neigh_dump,
},
.orig = {
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
.print = batadv_v_orig_print,
+#endif
+ .dump = batadv_v_orig_dump,
+ },
+ .gw = {
+ .store_sel_class = batadv_v_store_sel_class,
+ .show_sel_class = batadv_v_show_sel_class,
+ .get_best_gw_node = batadv_v_gw_get_best_gw_node,
+ .is_eligible = batadv_v_gw_is_eligible,
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+ .print = batadv_v_gw_print,
+#endif
+ .dump = batadv_v_gw_dump,
},
};
@@ -363,7 +1086,16 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
*/
int batadv_v_mesh_init(struct batadv_priv *bat_priv)
{
- return batadv_v_ogm_init(bat_priv);
+ int ret = 0;
+
+ ret = batadv_v_ogm_init(bat_priv);
+ if (ret < 0)
+ return ret;
+
+ /* set default throughput difference threshold to 5Mbps */
+ atomic_set(&bat_priv->gw.sel_class, 50);
+
+ return 0;
}
/**
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index ee08540ce503..f2fb2f05b6bf 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -75,6 +75,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
struct ethtool_link_ksettings link_settings;
+ struct net_device *real_netdev;
struct station_info sinfo;
u32 throughput;
int ret;
@@ -89,23 +90,27 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
/* if this is a wireless device, then ask its throughput through
* cfg80211 API
*/
- if (batadv_is_wifi_netdev(hard_iface->net_dev)) {
- if (hard_iface->net_dev->ieee80211_ptr) {
- ret = cfg80211_get_station(hard_iface->net_dev,
- neigh->addr, &sinfo);
- if (ret == -ENOENT) {
- /* Node is not associated anymore! It would be
- * possible to delete this neighbor. For now set
- * the throughput metric to 0.
- */
- return 0;
- }
- if (!ret)
- return sinfo.expected_throughput / 100;
+ if (batadv_is_wifi_hardif(hard_iface)) {
+ if (!batadv_is_cfg80211_hardif(hard_iface))
+ /* unsupported WiFi driver version */
+ goto default_throughput;
+
+ real_netdev = batadv_get_real_netdev(hard_iface->net_dev);
+ if (!real_netdev)
+ goto default_throughput;
+
+ ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
+
+ dev_put(real_netdev);
+ if (ret == -ENOENT) {
+ /* Node is not associated anymore! It would be
+ * possible to delete this neighbor. For now set
+ * the throughput metric to 0.
+ */
+ return 0;
}
-
- /* unsupported WiFi driver version */
- goto default_throughput;
+ if (!ret)
+ return sinfo.expected_throughput / 100;
}
/* if not a wifi interface, check if this device provides data via
@@ -187,7 +192,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
int elp_skb_len;
/* this probing routine is for Wifi neighbours only */
- if (!batadv_is_wifi_netdev(hard_iface->net_dev))
+ if (!batadv_is_wifi_hardif(hard_iface))
return true;
/* probe the neighbor only if no unicast packets have been sent
@@ -352,7 +357,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
/* warn the user (again) if there is no throughput data is available */
hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
- if (batadv_is_wifi_netdev(hard_iface->net_dev))
+ if (batadv_is_wifi_hardif(hard_iface))
hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
@@ -492,20 +497,21 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_elp_packet *elp_packet;
struct batadv_hard_iface *primary_if;
struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
- bool ret;
+ bool res;
+ int ret = NET_RX_DROP;
- ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
- if (!ret)
- return NET_RX_DROP;
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+ if (!res)
+ goto free_skb;
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- return NET_RX_DROP;
+ goto free_skb;
/* did we receive a B.A.T.M.A.N. V ELP packet on an interface
* that does not have B.A.T.M.A.N. V ELP enabled ?
*/
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
- return NET_RX_DROP;
+ goto free_skb;
elp_packet = (struct batadv_elp_packet *)skb->data;
@@ -516,14 +522,19 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
- goto out;
+ goto free_skb;
batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
elp_packet);
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- consume_skb(skb);
- return NET_RX_SUCCESS;
+ ret = NET_RX_SUCCESS;
+ batadv_hardif_put(primary_if);
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
}
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 6fbba4eb0617..38b9aab83fc0 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -73,13 +73,12 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
if (!orig_node)
return NULL;
+ kref_get(&orig_node->refcount);
hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
batadv_choose_orig, orig_node,
&orig_node->hash_entry);
if (hash_added != 0) {
- /* orig_node->refcounter is initialised to 2 by
- * batadv_orig_node_new()
- */
+ /* remove refcnt for newly created orig_node and hash entry */
batadv_orig_node_put(orig_node);
batadv_orig_node_put(orig_node);
orig_node = NULL;
@@ -141,6 +140,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
unsigned char *ogm_buff, *pkt_buff;
int ogm_buff_len;
u16 tvlv_len = 0;
+ int ret;
bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
@@ -183,6 +183,31 @@ static void batadv_v_ogm_send(struct work_struct *work)
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
+ ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL);
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselve on %s surpressed: %s\n",
+ hard_iface->net_dev->name, type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
ogm_packet->orig, ntohl(ogm_packet->seqno),
@@ -402,7 +427,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo;
struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
bool protection_started = false;
int ret = -EINVAL;
@@ -487,7 +512,7 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_neigh_node *router = NULL;
- struct batadv_orig_node *orig_neigh_node = NULL;
+ struct batadv_orig_node *orig_neigh_node;
struct batadv_neigh_node *orig_neigh_router = NULL;
struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
u32 router_throughput, neigh_throughput;
@@ -652,6 +677,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *hard_iface;
struct batadv_ogm2_packet *ogm_packet;
u32 ogm_throughput, link_throughput, path_throughput;
+ int ret;
ethhdr = eth_hdr(skb);
ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
@@ -717,6 +743,35 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
+ ret = batadv_hardif_no_broadcast(hard_iface,
+ ogm_packet->orig,
+ hardif_neigh->orig);
+
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s surpressed: %s\n",
+ ogm_packet->orig, hard_iface->net_dev->name,
+ type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
orig_node, neigh_node,
if_incoming, hard_iface);
@@ -755,18 +810,18 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
* B.A.T.M.A.N. V enabled ?
*/
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
- return NET_RX_DROP;
+ goto free_skb;
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
- return NET_RX_DROP;
+ goto free_skb;
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- return NET_RX_DROP;
+ goto free_skb;
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
- return NET_RX_DROP;
+ goto free_skb;
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -787,7 +842,12 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
}
ret = NET_RX_SUCCESS;
- consume_skb(skb);
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return ret;
}
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index ad2ffe16d29f..e7f690b571ea 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -35,6 +35,7 @@
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
@@ -45,12 +46,18 @@
#include <linux/string.h>
#include <linux/workqueue.h>
#include <net/arp.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
+#include "netlink.h"
#include "originator.h"
#include "packet.h"
+#include "soft-interface.h"
#include "sysfs.h"
#include "translation-table.h"
@@ -519,11 +526,9 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
atomic_set(&entry->wait_periods, 0);
ether_addr_copy(entry->orig, orig);
INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report);
-
- /* one for the hash, one for returning */
kref_init(&entry->refcount);
- kref_get(&entry->refcount);
+ kref_get(&entry->refcount);
hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
batadv_compare_backbone_gw,
batadv_choose_backbone_gw, entry,
@@ -711,12 +716,13 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
claim->lasttime = jiffies;
kref_get(&backbone_gw->refcount);
claim->backbone_gw = backbone_gw;
-
kref_init(&claim->refcount);
- kref_get(&claim->refcount);
+
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"bla_add_claim(): adding new entry %pM, vid %d to hash ...\n",
mac, BATADV_PRINT_VID(vid));
+
+ kref_get(&claim->refcount);
hash_added = batadv_hash_add(bat_priv->bla.claim_hash,
batadv_compare_claim,
batadv_choose_claim, claim,
@@ -1148,7 +1154,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
/* Let the loopdetect frames on the mesh in any case. */
if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT)
- return 0;
+ return false;
/* check if it is a claim frame. */
ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst,
@@ -1990,6 +1996,7 @@ out:
return ret;
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_bla_claim_table_seq_print_text - print the claim table in a seq file
* @seq: seq file to print on
@@ -2050,8 +2057,172 @@ out:
batadv_hardif_put(primary_if);
return 0;
}
+#endif
+
+/**
+ * batadv_bla_claim_dump_entry - dump one entry of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @claim: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_bla_claim *claim)
+{
+ u8 *primary_addr = primary_if->net_dev->dev_addr;
+ u16 backbone_crc;
+ bool is_own;
+ void *hdr;
+ int ret = -EINVAL;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ is_own = batadv_compare_eth(claim->backbone_gw->orig,
+ primary_addr);
+
+ spin_lock_bh(&claim->backbone_gw->crc_lock);
+ backbone_crc = claim->backbone_gw->crc;
+ spin_unlock_bh(&claim->backbone_gw->crc_lock);
+
+ if (is_own)
+ if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) ||
+ nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+ claim->backbone_gw->orig) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ backbone_crc)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_bla_claim_dump_bucket - dump one bucket of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hard_iface *primary_if,
+ struct hlist_head *head, int *idx_skip)
+{
+ struct batadv_bla_claim *claim;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ if (idx++ < *idx_skip)
+ continue;
+ if (batadv_bla_claim_dump_entry(msg, portid, seq,
+ primary_if, claim)) {
+ *idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ *idx_skip = idx;
+unlock:
+ rcu_read_unlock();
+ return 0;
+}
/**
+ * batadv_bla_claim_dump - dump claim table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_hashtable *hash;
+ struct batadv_priv *bat_priv;
+ int bucket = cb->args[0];
+ struct hlist_head *head;
+ int idx = cb->args[1];
+ int ifindex;
+ int ret = 0;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+ hash = bat_priv->bla.claim_hash;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_bla_claim_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ primary_if, head, &idx))
+ break;
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ ret = msg->len;
+
+out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+/**
* batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
* file
* @seq: seq file to print on
@@ -2114,3 +2285,168 @@ out:
batadv_hardif_put(primary_if);
return 0;
}
+#endif
+
+/**
+ * batadv_bla_backbone_dump_entry - dump one entry of the backbone table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @backbone_gw: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ u8 *primary_addr = primary_if->net_dev->dev_addr;
+ u16 backbone_crc;
+ bool is_own;
+ int msecs;
+ void *hdr;
+ int ret = -EINVAL;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
+
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+
+ msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime);
+
+ if (is_own)
+ if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+ backbone_gw->orig) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ backbone_crc) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hard_iface *primary_if,
+ struct hlist_head *head, int *idx_skip)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (idx++ < *idx_skip)
+ continue;
+ if (batadv_bla_backbone_dump_entry(msg, portid, seq,
+ primary_if, backbone_gw)) {
+ *idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ *idx_skip = idx;
+unlock:
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * batadv_bla_backbone_dump - dump backbone table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_hashtable *hash;
+ struct batadv_priv *bat_priv;
+ int bucket = cb->args[0];
+ struct hlist_head *head;
+ int idx = cb->args[1];
+ int ifindex;
+ int ret = 0;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+ hash = bat_priv->bla.backbone_hash;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_bla_backbone_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ primary_if, head, &idx))
+ break;
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ ret = msg->len;
+
+out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
+}
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 0f01daeb359e..1ae93e46fb98 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -23,6 +23,7 @@
#include <linux/types.h>
struct net_device;
+struct netlink_callback;
struct seq_file;
struct sk_buff;
@@ -35,8 +36,10 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
int hdr_size);
int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
void *offset);
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid);
bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
@@ -47,7 +50,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
void batadv_bla_status_update(struct net_device *net_dev);
int batadv_bla_init(struct batadv_priv *bat_priv);
void batadv_bla_free(struct batadv_priv *bat_priv);
-
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
#define BATADV_BLA_CRC_INIT 0
#else /* ifdef CONFIG_BATMAN_ADV_BLA */
@@ -112,6 +115,18 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv)
{
}
+static inline int batadv_bla_claim_dump(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int batadv_bla_backbone_dump(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
#endif /* ifdef CONFIG_BATMAN_ADV_BLA */
#endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 1d68b6e63b96..77925504379d 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -31,6 +31,7 @@
#include <linux/stddef.h>
#include <linux/stringify.h>
#include <linux/sysfs.h>
+#include <net/net_namespace.h>
#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
@@ -185,7 +186,7 @@ struct batadv_debuginfo batadv_debuginfo_##_name = { \
/* the following attributes are general and therefore they will be directly
* placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
*/
-static BATADV_DEBUGINFO(routing_algos, S_IRUGO, batadv_algorithms_open);
+static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open);
static struct batadv_debuginfo *batadv_general_debuginfos[] = {
&batadv_debuginfo_routing_algos,
@@ -193,26 +194,24 @@ static struct batadv_debuginfo *batadv_general_debuginfos[] = {
};
/* The following attributes are per soft interface */
-static BATADV_DEBUGINFO(neighbors, S_IRUGO, neighbors_open);
-static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open);
-static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open);
-static BATADV_DEBUGINFO(transtable_global, S_IRUGO,
- batadv_transtable_global_open);
+static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open);
+static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open);
+static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open);
+static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open);
#ifdef CONFIG_BATMAN_ADV_BLA
-static BATADV_DEBUGINFO(bla_claim_table, S_IRUGO, batadv_bla_claim_table_open);
-static BATADV_DEBUGINFO(bla_backbone_table, S_IRUGO,
+static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open);
+static BATADV_DEBUGINFO(bla_backbone_table, 0444,
batadv_bla_backbone_table_open);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
-static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open);
+static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open);
#endif
-static BATADV_DEBUGINFO(transtable_local, S_IRUGO,
- batadv_transtable_local_open);
+static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open);
#ifdef CONFIG_BATMAN_ADV_NC
-static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
+static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
-static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open);
+static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open);
#endif
static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
@@ -252,7 +251,7 @@ struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
}, \
}
-static BATADV_HARDIF_DEBUGINFO(originators, S_IRUGO,
+static BATADV_HARDIF_DEBUGINFO(originators, 0444,
batadv_originators_hardif_open);
static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
@@ -305,12 +304,16 @@ void batadv_debugfs_destroy(void)
*/
int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
{
+ struct net *net = dev_net(hard_iface->net_dev);
struct batadv_debuginfo **bat_debug;
struct dentry *file;
if (!batadv_debugfs)
goto out;
+ if (net != &init_net)
+ return 0;
+
hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
batadv_debugfs);
if (!hard_iface->debug_dir)
@@ -341,6 +344,11 @@ out:
*/
void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
{
+ struct net *net = dev_net(hard_iface->net_dev);
+
+ if (net != &init_net)
+ return;
+
if (batadv_debugfs) {
debugfs_remove_recursive(hard_iface->debug_dir);
hard_iface->debug_dir = NULL;
@@ -351,11 +359,15 @@ int batadv_debugfs_add_meshif(struct net_device *dev)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
struct batadv_debuginfo **bat_debug;
+ struct net *net = dev_net(dev);
struct dentry *file;
if (!batadv_debugfs)
goto out;
+ if (net != &init_net)
+ return 0;
+
bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
if (!bat_priv->debug_dir)
goto out;
@@ -392,6 +404,10 @@ out:
void batadv_debugfs_del_meshif(struct net_device *dev)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+
+ if (net != &init_net)
+ return;
batadv_debug_log_cleanup(bat_priv);
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 1ab4e2e63afc..e49121ee55f6 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -20,13 +20,11 @@
#include "main.h"
-#include <linux/kconfig.h>
-
struct net_device;
#define BATADV_DEBUGFS_SUBDIR "batman_adv"
-#if IS_ENABLED(CONFIG_DEBUG_FS)
+#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
void batadv_debugfs_init(void);
void batadv_debugfs_destroy(void);
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index b1cc8bfe11ac..49576c5a3fe3 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -343,8 +343,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
ether_addr_copy(dat_entry->mac_addr, mac_addr);
dat_entry->last_update = jiffies;
kref_init(&dat_entry->refcount);
- kref_get(&dat_entry->refcount);
+ kref_get(&dat_entry->refcount);
hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
batadv_hash_dat, dat_entry,
&dat_entry->hash_entry);
@@ -369,12 +369,11 @@ out:
* batadv_dbg_arp - print a debug message containing all the ARP packet details
* @bat_priv: the bat priv with all the soft interface information
* @skb: ARP packet
- * @type: ARP type
* @hdr_size: size of the possible header before the ARP packet
* @msg: message to print together with the debugging information
*/
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- u16 type, int hdr_size, char *msg)
+ int hdr_size, char *msg)
{
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
struct batadv_bcast_packet *bcast_pkt;
@@ -441,7 +440,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
#else
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- u16 type, int hdr_size, char *msg)
+ int hdr_size, char *msg)
{
}
@@ -795,6 +794,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
batadv_dat_hash_free(bat_priv);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_dat_cache_seq_print_text - print the local DAT hash table
* @seq: seq file to print on
@@ -846,6 +846,7 @@ out:
batadv_hardif_put(primary_if);
return 0;
}
+#endif
/**
* batadv_arp_get_type - parse an ARP packet and gets the type
@@ -948,6 +949,41 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
}
/**
+ * batadv_dat_arp_create_reply - create an ARP Reply
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip_src: ARP sender IP
+ * @ip_dst: ARP target IP
+ * @hw_src: Ethernet source and ARP sender MAC
+ * @hw_dst: Ethernet destination and ARP target MAC
+ * @vid: VLAN identifier (optional, set to zero otherwise)
+ *
+ * Creates an ARP Reply from the given values, optionally encapsulated in a
+ * VLAN header.
+ *
+ * Return: An skb containing an ARP Reply.
+ */
+static struct sk_buff *
+batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
+ __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
+ unsigned short vid)
+{
+ struct sk_buff *skb;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
+ ip_src, hw_dst, hw_src, hw_dst);
+ if (!skb)
+ return NULL;
+
+ skb_reset_mac_header(skb);
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ return skb;
+}
+
+/**
* batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
* answer using DAT
* @bat_priv: the bat priv with all the soft interface information
@@ -981,8 +1017,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
if (type != ARPOP_REQUEST)
goto out;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing outgoing ARP REQUEST");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REQUEST");
ip_src = batadv_arp_ip_src(skb, hdr_size);
hw_src = batadv_arp_hw_src(skb, hdr_size);
@@ -1005,20 +1040,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
goto out;
}
- skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- bat_priv->soft_iface, ip_dst, hw_src,
- dat_entry->mac_addr, hw_src);
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr,
+ hw_src, vid);
if (!skb_new)
goto out;
- if (vid & BATADV_VLAN_HAS_TAG) {
- skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
- if (!skb_new)
- goto out;
- }
-
- skb_reset_mac_header(skb_new);
skb_new->protocol = eth_type_trans(skb_new,
bat_priv->soft_iface);
bat_priv->stats.rx_packets++;
@@ -1073,8 +1100,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
ip_src = batadv_arp_ip_src(skb, hdr_size);
ip_dst = batadv_arp_ip_dst(skb, hdr_size);
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing incoming ARP REQUEST");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REQUEST");
batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
@@ -1082,25 +1108,11 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
if (!dat_entry)
goto out;
- skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- bat_priv->soft_iface, ip_dst, hw_src,
- dat_entry->mac_addr, hw_src);
-
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr, hw_src, vid);
if (!skb_new)
goto out;
- /* the rest of the TX path assumes that the mac_header offset pointing
- * to the inner Ethernet header has been set, therefore reset it now.
- */
- skb_reset_mac_header(skb_new);
-
- if (vid & BATADV_VLAN_HAS_TAG) {
- skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
- if (!skb_new)
- goto out;
- }
-
/* To preserve backwards compatibility, the node has choose the outgoing
* format based on the incoming request packet type. The assumption is
* that a node not using the 4addr packet format doesn't support it.
@@ -1147,8 +1159,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
if (type != ARPOP_REPLY)
return;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing outgoing ARP REPLY");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REPLY");
hw_src = batadv_arp_hw_src(skb, hdr_size);
ip_src = batadv_arp_ip_src(skb, hdr_size);
@@ -1193,8 +1204,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
if (type != ARPOP_REPLY)
goto out;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing incoming ARP REPLY");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REPLY");
hw_src = batadv_arp_hw_src(skb, hdr_size);
ip_src = batadv_arp_ip_src(skb, hdr_size);
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0934730fb7ff..0854ebd8613e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/if_ether.h>
@@ -42,17 +43,23 @@
/**
* batadv_frag_clear_chain - delete entries in the fragment buffer chain
* @head: head of chain with entries.
+ * @dropped: whether the chain is cleared because all fragments are dropped
*
* Free fragments in the passed hlist. Should be called with appropriate lock.
*/
-static void batadv_frag_clear_chain(struct hlist_head *head)
+static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
{
struct batadv_frag_list_entry *entry;
struct hlist_node *node;
hlist_for_each_entry_safe(entry, node, head, list) {
hlist_del(&entry->list);
- kfree_skb(entry->skb);
+
+ if (dropped)
+ kfree_skb(entry->skb);
+ else
+ consume_skb(entry->skb);
+
kfree(entry);
}
}
@@ -73,7 +80,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
spin_lock_bh(&chain->lock);
if (!check_cb || check_cb(chain)) {
- batadv_frag_clear_chain(&chain->head);
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
}
@@ -117,8 +124,8 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
if (chain->seqno == seqno)
return false;
- if (!hlist_empty(&chain->head))
- batadv_frag_clear_chain(&chain->head);
+ if (!hlist_empty(&chain->fragment_list))
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
chain->seqno = seqno;
@@ -176,7 +183,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
chain = &orig_node->fragments[bucket];
spin_lock_bh(&chain->lock);
if (batadv_frag_init_chain(chain, seqno)) {
- hlist_add_head(&frag_entry_new->list, &chain->head);
+ hlist_add_head(&frag_entry_new->list, &chain->fragment_list);
chain->size = skb->len - hdr_size;
chain->timestamp = jiffies;
chain->total_size = ntohs(frag_packet->total_size);
@@ -185,7 +192,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
}
/* Find the position for the new fragment. */
- hlist_for_each_entry(frag_entry_curr, &chain->head, list) {
+ hlist_for_each_entry(frag_entry_curr, &chain->fragment_list, list) {
/* Drop packet if fragment already exists. */
if (frag_entry_curr->no == frag_entry_new->no)
goto err_unlock;
@@ -220,11 +227,11 @@ out:
* exceeds the maximum size of one merged packet. Don't allow
* packets to have different total_size.
*/
- batadv_frag_clear_chain(&chain->head);
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
} else if (ntohs(frag_packet->total_size) == chain->size) {
/* All fragments received. Hand over chain to caller. */
- hlist_move_list(&chain->head, chain_out);
+ hlist_move_list(&chain->fragment_list, chain_out);
chain->size = 0;
}
@@ -252,8 +259,9 @@ batadv_frag_merge_packets(struct hlist_head *chain)
{
struct batadv_frag_packet *packet;
struct batadv_frag_list_entry *entry;
- struct sk_buff *skb_out = NULL;
+ struct sk_buff *skb_out;
int size, hdr_size = sizeof(struct batadv_frag_packet);
+ bool dropped = false;
/* Remove first entry, as this is the destination for the rest of the
* fragments.
@@ -270,6 +278,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
kfree_skb(skb_out);
skb_out = NULL;
+ dropped = true;
goto free;
}
@@ -291,7 +300,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
free:
/* Locking is not needed, because 'chain' is not part of any orig. */
- batadv_frag_clear_chain(chain);
+ batadv_frag_clear_chain(chain, dropped);
return skb_out;
}
@@ -352,7 +361,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_orig_node *orig_node_src)
{
struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct batadv_orig_node *orig_node_dst = NULL;
+ struct batadv_orig_node *orig_node_dst;
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_frag_packet *packet;
u16 total_size;
@@ -433,8 +442,7 @@ err:
* @orig_node: final destination of the created fragments
* @neigh_node: next-hop of the created fragments
*
- * Return: the netdev tx status or -1 in case of error.
- * When -1 is returned the skb is not consumed.
+ * Return: the netdev tx status or a negative errno code on a failure
*/
int batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -447,7 +455,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
unsigned int max_fragment_size, max_packet_size;
- int ret = -1;
+ int ret;
/* To avoid merge and refragmentation at next-hops we never send
* fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
@@ -457,13 +465,17 @@ int batadv_frag_send_packet(struct sk_buff *skb,
max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
/* Don't even try to fragment, if we need more than 16 fragments */
- if (skb->len > max_packet_size)
- goto out;
+ if (skb->len > max_packet_size) {
+ ret = -EAGAIN;
+ goto free_skb;
+ }
bat_priv = orig_node->bat_priv;
primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
+ if (!primary_if) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
@@ -488,34 +500,35 @@ int batadv_frag_send_packet(struct sk_buff *skb,
/* Eat and send fragments from the tail of skb */
while (skb->len > max_fragment_size) {
skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
- if (!skb_fragment)
- goto out;
+ if (!skb_fragment) {
+ ret = -ENOMEM;
+ goto put_primary_if;
+ }
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb_fragment->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
if (ret != NET_XMIT_SUCCESS) {
- /* return -1 so that the caller can free the original
- * skb
- */
- ret = -1;
- goto out;
+ ret = NET_XMIT_DROP;
+ goto put_primary_if;
}
frag_header.no++;
/* The initial check in this function should cover this case */
if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
- ret = -1;
- goto out;
+ ret = -EINVAL;
+ goto put_primary_if;
}
}
/* Make room for the fragment header. */
if (batadv_skb_head_push(skb, header_size) < 0 ||
- pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0)
- goto out;
+ pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
+ ret = -ENOMEM;
+ goto put_primary_if;
+ }
memcpy(skb->data, &frag_header, header_size);
@@ -524,10 +537,13 @@ int batadv_frag_send_packet(struct sk_buff *skb,
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+put_primary_if:
+ batadv_hardif_put(primary_if);
+free_skb:
+ kfree_skb(skb);
return ret;
}
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 3202fe329e63..b95f619606af 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -47,7 +47,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
static inline bool
batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
{
- if (!hlist_empty(&frags_entry->head) &&
+ if (!hlist_empty(&frags_entry->fragment_list) &&
batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT))
return true;
return false;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 63a805d3f96e..52b8bd6ec431 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/if_ether.h>
@@ -31,6 +32,7 @@
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
@@ -39,13 +41,17 @@
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/udp.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
#include "gateway_common.h"
#include "hard-interface.h"
#include "log.h"
+#include "netlink.h"
#include "originator.h"
#include "packet.h"
#include "routing.h"
+#include "soft-interface.h"
#include "sysfs.h"
#include "translation-table.h"
@@ -80,12 +86,12 @@ static void batadv_gw_node_release(struct kref *ref)
* batadv_gw_node_put - decrement the gw_node refcounter and possibly release it
* @gw_node: gateway node to free
*/
-static void batadv_gw_node_put(struct batadv_gw_node *gw_node)
+void batadv_gw_node_put(struct batadv_gw_node *gw_node)
{
kref_put(&gw_node->refcount, batadv_gw_node_release);
}
-static struct batadv_gw_node *
+struct batadv_gw_node *
batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
{
struct batadv_gw_node *gw_node;
@@ -164,86 +170,6 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv)
atomic_set(&bat_priv->gw.reselect, 1);
}
-static struct batadv_gw_node *
-batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
-{
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo;
- struct batadv_gw_node *gw_node, *curr_gw = NULL;
- u64 max_gw_factor = 0;
- u64 tmp_gw_factor = 0;
- u8 max_tq = 0;
- u8 tq_avg;
- struct batadv_orig_node *orig_node;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
- orig_node = gw_node->orig_node;
- router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
- if (!router)
- continue;
-
- router_ifinfo = batadv_neigh_ifinfo_get(router,
- BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto next;
-
- if (!kref_get_unless_zero(&gw_node->refcount))
- goto next;
-
- tq_avg = router_ifinfo->bat_iv.tq_avg;
-
- switch (atomic_read(&bat_priv->gw.sel_class)) {
- case 1: /* fast connection */
- tmp_gw_factor = tq_avg * tq_avg;
- tmp_gw_factor *= gw_node->bandwidth_down;
- tmp_gw_factor *= 100 * 100;
- tmp_gw_factor >>= 18;
-
- if ((tmp_gw_factor > max_gw_factor) ||
- ((tmp_gw_factor == max_gw_factor) &&
- (tq_avg > max_tq))) {
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- curr_gw = gw_node;
- kref_get(&curr_gw->refcount);
- }
- break;
-
- default: /* 2: stable connection (use best statistic)
- * 3: fast-switch (use best statistic but change as
- * soon as a better gateway appears)
- * XX: late-switch (use best statistic but change as
- * soon as a better gateway appears which has
- * $routing_class more tq points)
- */
- if (tq_avg > max_tq) {
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- curr_gw = gw_node;
- kref_get(&curr_gw->refcount);
- }
- break;
- }
-
- if (tq_avg > max_tq)
- max_tq = tq_avg;
-
- if (tmp_gw_factor > max_gw_factor)
- max_gw_factor = tmp_gw_factor;
-
- batadv_gw_node_put(gw_node);
-
-next:
- batadv_neigh_node_put(router);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- }
- rcu_read_unlock();
-
- return curr_gw;
-}
-
/**
* batadv_gw_check_client_stop - check if client mode has been switched off
* @bat_priv: the bat priv with all the soft interface information
@@ -287,12 +213,19 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
goto out;
+ if (!bat_priv->algo_ops->gw.get_best_gw_node)
+ goto out;
+
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw)
goto out;
- next_gw = batadv_gw_get_best_gw_node(bat_priv);
+ /* if gw.reselect is set to 1 it means that a previous call to
+ * gw.is_eligible() said that we have a new best GW, therefore it can
+ * now be picked from the list and selected
+ */
+ next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv);
if (curr_gw == next_gw)
goto out;
@@ -360,70 +293,31 @@ out:
void batadv_gw_check_election(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node)
{
- struct batadv_neigh_ifinfo *router_orig_tq = NULL;
- struct batadv_neigh_ifinfo *router_gw_tq = NULL;
struct batadv_orig_node *curr_gw_orig;
- struct batadv_neigh_node *router_gw = NULL;
- struct batadv_neigh_node *router_orig = NULL;
- u8 gw_tq_avg, orig_tq_avg;
+
+ /* abort immediately if the routing algorithm does not support gateway
+ * election
+ */
+ if (!bat_priv->algo_ops->gw.is_eligible)
+ return;
curr_gw_orig = batadv_gw_get_selected_orig(bat_priv);
if (!curr_gw_orig)
goto reselect;
- router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
- if (!router_gw)
- goto reselect;
-
- router_gw_tq = batadv_neigh_ifinfo_get(router_gw,
- BATADV_IF_DEFAULT);
- if (!router_gw_tq)
- goto reselect;
-
/* this node already is the gateway */
if (curr_gw_orig == orig_node)
goto out;
- router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
- if (!router_orig)
+ if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig,
+ orig_node))
goto out;
- router_orig_tq = batadv_neigh_ifinfo_get(router_orig,
- BATADV_IF_DEFAULT);
- if (!router_orig_tq)
- goto out;
-
- gw_tq_avg = router_gw_tq->bat_iv.tq_avg;
- orig_tq_avg = router_orig_tq->bat_iv.tq_avg;
-
- /* the TQ value has to be better */
- if (orig_tq_avg < gw_tq_avg)
- goto out;
-
- /* if the routing class is greater than 3 the value tells us how much
- * greater the TQ value of the new gateway must be
- */
- if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
- (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
- goto out;
-
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
- gw_tq_avg, orig_tq_avg);
-
reselect:
batadv_gw_reselect(bat_priv);
out:
if (curr_gw_orig)
batadv_orig_node_put(curr_gw_orig);
- if (router_gw)
- batadv_neigh_node_put(router_gw);
- if (router_orig)
- batadv_neigh_node_put(router_orig);
- if (router_gw_tq)
- batadv_neigh_ifinfo_put(router_gw_tq);
- if (router_orig_tq)
- batadv_neigh_ifinfo_put(router_orig_tq);
}
/**
@@ -445,15 +339,16 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
if (!gw_node)
return;
- kref_get(&orig_node->refcount);
+ kref_init(&gw_node->refcount);
INIT_HLIST_NODE(&gw_node->list);
+ kref_get(&orig_node->refcount);
gw_node->orig_node = orig_node;
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
- kref_init(&gw_node->refcount);
spin_lock_bh(&bat_priv->gw.list_lock);
- hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
+ kref_get(&gw_node->refcount);
+ hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
spin_unlock_bh(&bat_priv->gw.list_lock);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -463,6 +358,9 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
ntohl(gateway->bandwidth_down) % 10,
ntohl(gateway->bandwidth_up) / 10,
ntohl(gateway->bandwidth_up) % 10);
+
+ /* don't return reference to new gw_node */
+ batadv_gw_node_put(gw_node);
}
/**
@@ -472,14 +370,14 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
*
* Return: gateway node if found or NULL otherwise.
*/
-static struct batadv_gw_node *
-batadv_gw_node_get(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node)
+struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
{
struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list,
+ list) {
if (gw_node_tmp->orig_node != orig_node)
continue;
@@ -578,88 +476,94 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
spin_lock_bh(&bat_priv->gw.list_lock);
hlist_for_each_entry_safe(gw_node, node_tmp,
- &bat_priv->gw.list, list) {
+ &bat_priv->gw.gateway_list, list) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
}
spin_unlock_bh(&bat_priv->gw.list_lock);
}
-/* fails if orig_node has no router */
-static int batadv_write_buffer_text(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- const struct batadv_gw_node *gw_node)
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
{
- struct batadv_gw_node *curr_gw;
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- int ret = -1;
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
- router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
- if (!router)
- goto out;
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ return 0;
- router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto out;
+ seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
+ BATADV_SOURCE_VERSION, primary_if->net_dev->name,
+ primary_if->net_dev->dev_addr, net_dev->name,
+ bat_priv->algo_ops->name);
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ batadv_hardif_put(primary_if);
- seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
- (curr_gw == gw_node ? "=>" : " "),
- gw_node->orig_node->orig,
- router_ifinfo->bat_iv.tq_avg, router->addr,
- router->if_incoming->net_dev->name,
- gw_node->bandwidth_down / 10,
- gw_node->bandwidth_down % 10,
- gw_node->bandwidth_up / 10,
- gw_node->bandwidth_up % 10);
- ret = seq_has_overflowed(seq) ? -1 : 0;
+ if (!bat_priv->algo_ops->gw.print) {
+ seq_puts(seq,
+ "No printing function for this routing protocol\n");
+ return 0;
+ }
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
- return ret;
+ bat_priv->algo_ops->gw.print(bat_priv, seq);
+
+ return 0;
}
+#endif
-int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
+/**
+ * batadv_gw_dump - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ *
+ * Return: Error code, or length of message
+ */
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
- struct batadv_gw_node *gw_node;
- int gw_count = 0;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
+ struct batadv_hard_iface *primary_if = NULL;
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ int ifindex;
+ int ret;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
goto out;
+ }
- seq_printf(seq,
- " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name);
+ bat_priv = netdev_priv(soft_iface);
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
- /* fails if orig_node has no router */
- if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0)
- continue;
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
- gw_count++;
+ if (!bat_priv->algo_ops->gw.dump) {
+ ret = -EOPNOTSUPP;
+ goto out;
}
- rcu_read_unlock();
- if (gw_count == 0)
- seq_puts(seq, "No gateways in range ...\n");
+ bat_priv->algo_ops->gw.dump(msg, cb, bat_priv);
+
+ ret = msg->len;
out:
if (primary_if)
batadv_hardif_put(primary_if);
- return 0;
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
}
/**
@@ -801,7 +705,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
{
struct batadv_neigh_node *neigh_curr = NULL;
struct batadv_neigh_node *neigh_old = NULL;
- struct batadv_orig_node *orig_dst_node = NULL;
+ struct batadv_orig_node *orig_dst_node;
struct batadv_gw_node *gw_node = NULL;
struct batadv_gw_node *curr_gw = NULL;
struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 582dd8c413c8..859166d03561 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -23,6 +23,7 @@
#include <linux/types.h>
struct batadv_tvlv_gateway_data;
+struct netlink_callback;
struct seq_file;
struct sk_buff;
@@ -39,10 +40,16 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
void batadv_gw_node_delete(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node);
void batadv_gw_node_free(struct batadv_priv *bat_priv);
+void batadv_gw_node_put(struct batadv_gw_node *gw_node);
+struct batadv_gw_node *
+batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
enum batadv_dhcp_recipient
batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
u8 *chaddr);
+struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index d7bc6a87bcc9..21184810d89f 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -241,10 +241,9 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
batadv_gw_node_update(bat_priv, orig, &gateway);
- /* restart gateway selection if fast or late switching was enabled */
+ /* restart gateway selection */
if ((gateway.bandwidth_down != 0) &&
- (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) &&
- (atomic_read(&bat_priv->gw.sel_class) > 2))
+ (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT))
batadv_gw_check_election(bat_priv, orig);
}
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 1f9080840566..61a431a9772b 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -35,7 +35,8 @@
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/workqueue.h>
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
#include "bat_v.h"
#include "bridge_loop_avoidance.h"
@@ -85,25 +86,55 @@ out:
}
/**
+ * batadv_getlink_net - return link net namespace (of use fallback)
+ * @netdev: net_device to check
+ * @fallback_net: return in case get_link_net is not available for @netdev
+ *
+ * Return: result of rtnl_link_ops->get_link_net or @fallback_net
+ */
+static struct net *batadv_getlink_net(const struct net_device *netdev,
+ struct net *fallback_net)
+{
+ if (!netdev->rtnl_link_ops)
+ return fallback_net;
+
+ if (!netdev->rtnl_link_ops->get_link_net)
+ return fallback_net;
+
+ return netdev->rtnl_link_ops->get_link_net(netdev);
+}
+
+/**
* batadv_mutual_parents - check if two devices are each others parent
- * @dev1: 1st net_device
- * @dev2: 2nd net_device
+ * @dev1: 1st net dev
+ * @net1: 1st devices netns
+ * @dev2: 2nd net dev
+ * @net2: 2nd devices netns
*
* veth devices come in pairs and each is the parent of the other!
*
* Return: true if the devices are each others parent, otherwise false
*/
static bool batadv_mutual_parents(const struct net_device *dev1,
- const struct net_device *dev2)
+ struct net *net1,
+ const struct net_device *dev2,
+ struct net *net2)
{
int dev1_parent_iflink = dev_get_iflink(dev1);
int dev2_parent_iflink = dev_get_iflink(dev2);
+ const struct net *dev1_parent_net;
+ const struct net *dev2_parent_net;
+
+ dev1_parent_net = batadv_getlink_net(dev1, net1);
+ dev2_parent_net = batadv_getlink_net(dev2, net2);
if (!dev1_parent_iflink || !dev2_parent_iflink)
return false;
return (dev1_parent_iflink == dev2->ifindex) &&
- (dev2_parent_iflink == dev1->ifindex);
+ (dev2_parent_iflink == dev1->ifindex) &&
+ net_eq(dev1_parent_net, net2) &&
+ net_eq(dev2_parent_net, net1);
}
/**
@@ -121,8 +152,9 @@ static bool batadv_mutual_parents(const struct net_device *dev1,
*/
static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
{
- struct net_device *parent_dev;
struct net *net = dev_net(net_dev);
+ struct net_device *parent_dev;
+ struct net *parent_net;
bool ret;
/* check if this is a batman-adv mesh interface */
@@ -134,13 +166,16 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
dev_get_iflink(net_dev) == net_dev->ifindex)
return false;
+ parent_net = batadv_getlink_net(net_dev, net);
+
/* recurse over the parent device */
- parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev));
+ parent_dev = __dev_get_by_index((struct net *)parent_net,
+ dev_get_iflink(net_dev));
/* if we got a NULL parent_dev there is something broken.. */
if (WARN(!parent_dev, "Cannot find parent device"))
return false;
- if (batadv_mutual_parents(net_dev, parent_dev))
+ if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net))
return false;
ret = batadv_is_on_batman_iface(parent_dev);
@@ -167,13 +202,77 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev)
}
/**
- * batadv_is_wifi_netdev - check if the given net_device struct is a wifi
- * interface
+ * batadv_get_real_netdevice - check if the given netdev struct is a virtual
+ * interface on top of another 'real' interface
+ * @netdev: the device to check
+ *
+ * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev()
+ * instead of this.
+ *
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
+ */
+static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
+{
+ struct batadv_hard_iface *hard_iface = NULL;
+ struct net_device *real_netdev = NULL;
+ struct net *real_net;
+ struct net *net;
+ int ifindex;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return NULL;
+
+ if (netdev->ifindex == dev_get_iflink(netdev)) {
+ dev_hold(netdev);
+ return netdev;
+ }
+
+ hard_iface = batadv_hardif_get_by_netdev(netdev);
+ if (!hard_iface || !hard_iface->soft_iface)
+ goto out;
+
+ net = dev_net(hard_iface->soft_iface);
+ ifindex = dev_get_iflink(netdev);
+ real_net = batadv_getlink_net(netdev, net);
+ real_netdev = dev_get_by_index(real_net, ifindex);
+
+out:
+ if (hard_iface)
+ batadv_hardif_put(hard_iface);
+ return real_netdev;
+}
+
+/**
+ * batadv_get_real_netdev - check if the given net_device struct is a virtual
+ * interface on top of another 'real' interface
* @net_device: the device to check
*
- * Return: true if the net device is a 802.11 wireless device, false otherwise.
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
*/
-bool batadv_is_wifi_netdev(struct net_device *net_device)
+struct net_device *batadv_get_real_netdev(struct net_device *net_device)
+{
+ struct net_device *real_netdev;
+
+ rtnl_lock();
+ real_netdev = batadv_get_real_netdevice(net_device);
+ rtnl_unlock();
+
+ return real_netdev;
+}
+
+/**
+ * batadv_is_wext_netdev - check if the given net_device struct is a
+ * wext wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a wext wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_wext_netdev(struct net_device *net_device)
{
if (!net_device)
return false;
@@ -186,6 +285,22 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
return true;
#endif
+ return false;
+}
+
+/**
+ * batadv_is_cfg80211_netdev - check if the given net_device struct is a
+ * cfg80211 wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
+{
+ if (!net_device)
+ return false;
+
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
@@ -193,6 +308,125 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
return false;
}
+/**
+ * batadv_wifi_flags_evaluate - calculate wifi flags for net_device
+ * @net_device: the device to check
+ *
+ * Return: batadv_hard_iface_wifi_flags flags of the device
+ */
+static u32 batadv_wifi_flags_evaluate(struct net_device *net_device)
+{
+ u32 wifi_flags = 0;
+ struct net_device *real_netdev;
+
+ if (batadv_is_wext_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT;
+
+ if (batadv_is_cfg80211_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+
+ real_netdev = batadv_get_real_netdevice(net_device);
+ if (!real_netdev)
+ return wifi_flags;
+
+ if (real_netdev == net_device)
+ goto out;
+
+ if (batadv_is_wext_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT;
+
+ if (batadv_is_cfg80211_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+out:
+ dev_put(real_netdev);
+ return wifi_flags;
+}
+
+/**
+ * batadv_is_cfg80211_hardif - check if the given hardif is a cfg80211 wifi
+ * interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface)
+{
+ u32 allowed_flags = 0;
+
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+ return !!(hard_iface->wifi_flags & allowed_flags);
+}
+
+/**
+ * batadv_is_wifi_hardif - check if the given hardif is a wifi interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a 802.11 wireless device, false otherwise.
+ */
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface)
+{
+ if (!hard_iface)
+ return false;
+
+ return hard_iface->wifi_flags != 0;
+}
+
+/**
+ * batadv_hardif_no_broadcast - check whether (re)broadcast is necessary
+ * @if_outgoing: the outgoing interface checked and considered for (re)broadcast
+ * @orig_addr: the originator of this packet
+ * @orig_neigh: originator address of the forwarder we just got the packet from
+ * (NULL if we originated)
+ *
+ * Checks whether a packet needs to be (re)broadcasted on the given interface.
+ *
+ * Return:
+ * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface
+ * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder
+ * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator
+ * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast
+ */
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct hlist_node *first;
+ int ret = BATADV_HARDIF_BCAST_OK;
+
+ rcu_read_lock();
+
+ /* 0 neighbors -> no (re)broadcast */
+ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list));
+ if (!first) {
+ ret = BATADV_HARDIF_BCAST_NORECIPIENT;
+ goto out;
+ }
+
+ /* >1 neighbors -> (re)brodcast */
+ if (rcu_dereference(hlist_next_rcu(first)))
+ goto out;
+
+ hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node,
+ list);
+
+ /* 1 neighbor, is the originator -> no rebroadcast */
+ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) {
+ ret = BATADV_HARDIF_BCAST_DUPORIG;
+ /* 1 neighbor, is the one we received from -> no rebroadcast */
+ } else if (orig_neigh &&
+ batadv_compare_eth(hardif_neigh->orig, orig_neigh)) {
+ ret = BATADV_HARDIF_BCAST_DUPFWD;
+ }
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static struct batadv_hard_iface *
batadv_hardif_get_active(const struct net_device *soft_iface)
{
@@ -625,25 +859,6 @@ out:
batadv_hardif_put(primary_if);
}
-/**
- * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif
- * @work: work queue item
- *
- * Free the parts of the hard interface which can not be removed under
- * rtnl lock (to prevent deadlock situations).
- */
-static void batadv_hardif_remove_interface_finish(struct work_struct *work)
-{
- struct batadv_hard_iface *hard_iface;
-
- hard_iface = container_of(work, struct batadv_hard_iface,
- cleanup_work);
-
- batadv_debugfs_del_hardif(hard_iface);
- batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
- batadv_hardif_put(hard_iface);
-}
-
static struct batadv_hard_iface *
batadv_hardif_add_interface(struct net_device *net_dev)
{
@@ -676,22 +891,19 @@ batadv_hardif_add_interface(struct net_device *net_dev)
INIT_LIST_HEAD(&hard_iface->list);
INIT_HLIST_HEAD(&hard_iface->neigh_list);
- INIT_WORK(&hard_iface->cleanup_work,
- batadv_hardif_remove_interface_finish);
spin_lock_init(&hard_iface->neigh_list_lock);
+ kref_init(&hard_iface->refcount);
hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
- if (batadv_is_wifi_netdev(net_dev))
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
batadv_v_hardif_init(hard_iface);
- /* extra reference for return */
- kref_init(&hard_iface->refcount);
- kref_get(&hard_iface->refcount);
-
batadv_check_known_mac_addr(hard_iface->net_dev);
+ kref_get(&hard_iface->refcount);
list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
return hard_iface;
@@ -713,13 +925,15 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
/* first deactivate interface */
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_AUTO);
+ BATADV_IF_CLEANUP_KEEP);
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
return;
hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
- queue_work(batadv_event_workqueue, &hard_iface->cleanup_work);
+ batadv_debugfs_del_hardif(hard_iface);
+ batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
+ batadv_hardif_put(hard_iface);
}
void batadv_hardif_remove_interfaces(void)
@@ -792,6 +1006,11 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (hard_iface == primary_if)
batadv_primary_if_update_addr(bat_priv, NULL);
break;
+ case NETDEV_CHANGEUPPER:
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+ break;
default:
break;
}
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index a76724d369bf..d6309a423629 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -40,6 +40,20 @@ enum batadv_hard_if_state {
};
/**
+ * enum batadv_hard_if_bcast - broadcast avoidance options
+ * @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface
+ * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no recipient
+ * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it from
+ * @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator
+ */
+enum batadv_hard_if_bcast {
+ BATADV_HARDIF_BCAST_OK = 0,
+ BATADV_HARDIF_BCAST_NORECIPIENT,
+ BATADV_HARDIF_BCAST_DUPFWD,
+ BATADV_HARDIF_BCAST_DUPORIG,
+};
+
+/**
* enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
* @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
* @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
@@ -51,8 +65,9 @@ enum batadv_hard_if_cleanup {
extern struct notifier_block batadv_hard_if_notifier;
-bool batadv_is_wifi_netdev(struct net_device *net_device);
-bool batadv_is_wifi_iface(int ifindex);
+struct net_device *batadv_get_real_netdev(struct net_device *net_device);
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface);
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
@@ -63,6 +78,8 @@ void batadv_hardif_remove_interfaces(void);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
void batadv_hardif_release(struct kref *ref);
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh);
/**
* batadv_hardif_put - decrement the hard interface refcounter and possibly
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index cbbf87075f06..557a7044cfbc 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -61,36 +61,6 @@ void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
/* free only the hashtable and the hash itself. */
void batadv_hash_destroy(struct batadv_hashtable *hash);
-/* remove the hash structure. if hashdata_free_cb != NULL, this function will be
- * called to remove the elements inside of the hash. if you don't remove the
- * elements, memory might be leaked.
- */
-static inline void batadv_hash_delete(struct batadv_hashtable *hash,
- batadv_hashdata_free_cb free_cb,
- void *arg)
-{
- struct hlist_head *head;
- struct hlist_node *node, *node_tmp;
- spinlock_t *list_lock; /* spinlock to protect write access */
- u32 i;
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
- list_lock = &hash->list_locks[i];
-
- spin_lock_bh(list_lock);
- hlist_for_each_safe(node, node_tmp, head) {
- hlist_del_rcu(node);
-
- if (free_cb)
- free_cb(node, arg);
- }
- spin_unlock_bh(list_lock);
- }
-
- batadv_hash_destroy(hash);
-}
-
/**
* batadv_hash_add - adds data to the hashtable
* @hash: storage hash table
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 378cc1119d66..b310f381ae02 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -38,7 +38,6 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -322,8 +321,8 @@ int batadv_socket_setup(struct batadv_priv *bat_priv)
if (!bat_priv->debug_dir)
goto err;
- d = debugfs_create_file(BATADV_ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR,
- bat_priv->debug_dir, bat_priv, &batadv_fops);
+ d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
+ bat_priv, &batadv_fops);
if (!d)
goto err;
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 618d5de06f20..e44a7da51431 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -26,9 +26,25 @@ struct batadv_icmp_header;
#define BATADV_ICMP_SOCKET "socket"
-void batadv_socket_init(void);
int batadv_socket_setup(struct batadv_priv *bat_priv);
+
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+
+void batadv_socket_init(void);
void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
size_t icmp_len);
+#else
+
+static inline void batadv_socket_init(void)
+{
+}
+
+static inline void
+batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len)
+{
+}
+
+#endif
+
#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 56dc532f7a2c..c73c31769aba 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -31,7 +31,6 @@
#include <linux/sched.h> /* for linux/wait.h */
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/uaccess.h>
@@ -212,8 +211,7 @@ int batadv_debug_log_setup(struct batadv_priv *bat_priv)
spin_lock_init(&bat_priv->debug_log->lock);
init_waitqueue_head(&bat_priv->debug_log->queue_wait);
- d = debugfs_create_file("log", S_IFREG | S_IRUSR,
- bat_priv->debug_dir, bat_priv,
+ d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
&batadv_log_fops);
if (!d)
goto err;
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index e0e1a88c3e58..3284a7b0325d 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -63,7 +63,7 @@ enum batadv_dbg_level {
BATADV_DBG_NC = BIT(5),
BATADV_DBG_MCAST = BIT(6),
BATADV_DBG_TP_METER = BIT(7),
- BATADV_DBG_ALL = 127,
+ BATADV_DBG_ALL = 255,
};
#ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -71,12 +71,12 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
__printf(2, 3);
/* possibly ratelimited debug output */
-#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
- do { \
- if (atomic_read(&bat_priv->log_level) & type && \
- (!ratelimited || net_ratelimit())) \
- batadv_debug_log(bat_priv, fmt, ## arg);\
- } \
+#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
+ do { \
+ if (atomic_read(&(bat_priv)->log_level) & (type) && \
+ (!(ratelimited) || net_ratelimit())) \
+ batadv_debug_log(bat_priv, fmt, ## arg); \
+ } \
while (0)
#else /* !CONFIG_BATMAN_ADV_DEBUG */
__printf(4, 5)
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index fe4c5e29f96b..d46415edd3be 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -23,6 +23,7 @@
#include <linux/crc32c.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/genetlink.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
@@ -44,6 +45,7 @@
#include <linux/workqueue.h>
#include <net/dsfield.h>
#include <net/rtnetlink.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bat_iv_ogm.h"
@@ -82,6 +84,12 @@ static void batadv_recv_handler_init(void);
static int __init batadv_init(void)
{
+ int ret;
+
+ ret = batadv_tt_cache_init();
+ if (ret < 0)
+ return ret;
+
INIT_LIST_HEAD(&batadv_hardif_list);
batadv_algo_init();
@@ -93,9 +101,8 @@ static int __init batadv_init(void)
batadv_tp_meter_init();
batadv_event_workqueue = create_singlethread_workqueue("bat_events");
-
if (!batadv_event_workqueue)
- return -ENOMEM;
+ goto err_create_wq;
batadv_socket_init();
batadv_debugfs_init();
@@ -108,6 +115,11 @@ static int __init batadv_init(void)
BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
return 0;
+
+err_create_wq:
+ batadv_tt_cache_destroy();
+
+ return -ENOMEM;
}
static void __exit batadv_exit(void)
@@ -123,6 +135,8 @@ static void __exit batadv_exit(void)
batadv_event_workqueue = NULL;
rcu_barrier();
+
+ batadv_tt_cache_destroy();
}
int batadv_mesh_init(struct net_device *soft_iface)
@@ -148,7 +162,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
- INIT_HLIST_HEAD(&bat_priv->gw.list);
+ INIT_HLIST_HEAD(&bat_priv->gw.gateway_list);
#ifdef CONFIG_BATMAN_ADV_MCAST
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list);
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list);
@@ -270,6 +284,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
return is_my_mac;
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_seq_print_text_primary_if_get - called from debugfs table printing
* function that requires the primary interface
@@ -305,6 +320,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq)
out:
return primary_if;
}
+#endif
/**
* batadv_max_header_len - calculate maximum encapsulation overhead for a
@@ -388,6 +404,8 @@ void batadv_skb_set_priority(struct sk_buff *skb, int offset)
static int batadv_recv_unhandled_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
+ kfree_skb(skb);
+
return NET_RX_DROP;
}
@@ -402,7 +420,6 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
struct batadv_ogm_packet *batadv_ogm_packet;
struct batadv_hard_iface *hard_iface;
u8 idx;
- int ret;
hard_iface = container_of(ptype, struct batadv_hard_iface,
batman_adv_ptype);
@@ -452,14 +469,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
/* reset control block to avoid left overs from previous users */
memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
- /* all receive handlers return whether they received or reused
- * the supplied skb. if not, we have to free the skb.
- */
idx = batadv_ogm_packet->packet_type;
- ret = (*batadv_rx_handler[idx])(skb, hard_iface);
-
- if (ret == NET_RX_DROP)
- kfree_skb(skb);
+ (*batadv_rx_handler[idx])(skb, hard_iface);
batadv_hardif_put(hard_iface);
@@ -638,3 +649,5 @@ MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
MODULE_VERSION(BATADV_SOURCE_VERSION);
+MODULE_ALIAS_RTNL_LINK("batadv");
+MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 06a860845434..a6cc8040a21d 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.3"
+#define BATADV_SOURCE_VERSION "2016.5"
#endif
/* B.A.T.M.A.N. parameters */
@@ -48,6 +48,7 @@
#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
+#define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */
#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */
/* sliding packet range of received originator messages in sequence numbers
* (should be a multiple of our word size)
@@ -185,7 +186,6 @@ enum batadv_uev_type {
#include <linux/bitops.h> /* for packet.h */
#include <linux/compiler.h>
-#include <linux/cpumask.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h> /* for packet.h */
#include <linux/if_vlan.h>
@@ -200,8 +200,8 @@ struct packet_type;
struct seq_file;
struct sk_buff;
-#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \
- (int)(vid & VLAN_VID_MASK) : -1)
+#define BATADV_PRINT_VID(vid) (((vid) & BATADV_VLAN_HAS_TAG) ? \
+ (int)((vid) & VLAN_VID_MASK) : -1)
extern struct list_head batadv_hardif_list;
@@ -284,26 +284,6 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
-/**
- * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
- * @bat_priv: the bat priv with all the soft interface information
- * @idx: index of counter to sum up
- *
- * Return: sum of all cpu-local counters
- */
-static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
-{
- u64 *counters, sum = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
- sum += counters[idx];
- }
-
- return sum;
-}
-
/* Define a macro to reach the control buffer of the skb. The members of the
* control buffer are defined in struct batadv_skb_cb in types.h.
* The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index cc915073a753..090a69fc342e 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -33,6 +33,7 @@
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
@@ -48,6 +49,7 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/workqueue.h>
#include <net/addrconf.h>
#include <net/if_inet6.h>
#include <net/ip.h>
@@ -60,6 +62,18 @@
#include "translation-table.h"
#include "tvlv.h"
+static void batadv_mcast_mla_update(struct work_struct *work);
+
+/**
+ * batadv_mcast_start_timer - schedule the multicast periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work,
+ msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD));
+}
+
/**
* batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
* @soft_iface: netdev struct of the mesh interface
@@ -231,19 +245,15 @@ out:
/**
* batadv_mcast_mla_list_free - free a list of multicast addresses
- * @bat_priv: the bat priv with all the soft interface information
* @mcast_list: the list to free
*
* Removes and frees all items in the given mcast_list.
*/
-static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
- struct hlist_head *mcast_list)
+static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
{
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
-
hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
hlist_del(&mcast_entry->list);
kfree(mcast_entry);
@@ -259,6 +269,8 @@ static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
* translation table except the ones listed in the given mcast_list.
*
* If mcast_list is NULL then all are retracted.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -266,7 +278,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
+ WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
list) {
@@ -291,6 +303,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
*
* Adds multicast listener announcements from the given mcast_list to the
* translation table if they have not been added yet.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -298,7 +312,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
+ WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
if (!mcast_list)
return;
@@ -528,17 +542,22 @@ update:
}
return !(mcast_data.flags &
- (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6));
+ (BATADV_MCAST_WANT_ALL_IPV4 | BATADV_MCAST_WANT_ALL_IPV6));
}
/**
- * batadv_mcast_mla_update - update the own MLAs
+ * __batadv_mcast_mla_update - update the own MLAs
* @bat_priv: the bat priv with all the soft interface information
*
* Updates the own multicast listener announcements in the translation
* table as well as the own, announced multicast tvlv container.
+ *
+ * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list
+ * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are
+ * ensured by the non-parallel execution of the worker this function
+ * belongs to.
*/
-void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
{
struct net_device *soft_iface = bat_priv->soft_iface;
struct hlist_head mcast_list = HLIST_HEAD_INIT;
@@ -560,7 +579,30 @@ update:
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
out:
- batadv_mcast_mla_list_free(bat_priv, &mcast_list);
+ batadv_mcast_mla_list_free(&mcast_list);
+}
+
+/**
+ * batadv_mcast_mla_update - update the own MLAs
+ * @work: kernel work struct
+ *
+ * Updates the own multicast listener announcements in the translation
+ * table as well as the own, announced multicast tvlv container.
+ *
+ * In the end, reschedules the work timer.
+ */
+static void batadv_mcast_mla_update(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_mcast *priv_mcast;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work);
+ bat_priv = container_of(priv_mcast, struct batadv_priv, mcast);
+
+ __batadv_mcast_mla_update(bat_priv);
+ batadv_mcast_start_timer(bat_priv);
}
/**
@@ -1132,8 +1174,12 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
NULL, BATADV_TVLV_MCAST, 2,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+
+ INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update);
+ batadv_mcast_start_timer(bat_priv);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_mcast_flags_print_header - print own mcast flags to debugfs table
* @bat_priv: the bat priv with all the soft interface information
@@ -1234,6 +1280,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
return 0;
}
+#endif
/**
* batadv_mcast_free - free the multicast optimizations structures
@@ -1241,12 +1288,13 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
*/
void batadv_mcast_free(struct batadv_priv *bat_priv)
{
+ cancel_delayed_work_sync(&bat_priv->mcast.work);
+
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
- spin_lock_bh(&bat_priv->tt.commit_lock);
+ /* safely calling outside of worker, as worker was canceled above */
batadv_mcast_mla_tt_retract(bat_priv, NULL);
- spin_unlock_bh(&bat_priv->tt.commit_lock);
}
/**
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 1fb00ba84907..2cddaf52a21d 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -39,8 +39,6 @@ enum batadv_forw_mode {
#ifdef CONFIG_BATMAN_ADV_MCAST
-void batadv_mcast_mla_update(struct batadv_priv *bat_priv);
-
enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node **mcast_single_orig);
@@ -55,10 +53,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
#else
-static inline void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
-{
-}
-
static inline enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node **mcast_single_orig)
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 231f8eaf075b..062738163bdc 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -18,44 +18,51 @@
#include "netlink.h"
#include "main.h"
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/genetlink.h>
#include <linux/if_ether.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <net/genetlink.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <uapi/linux/batman_adv.h>
+#include "bat_algo.h"
+#include "bridge_loop_avoidance.h"
+#include "gateway_client.h"
#include "hard-interface.h"
+#include "originator.h"
+#include "packet.h"
#include "soft-interface.h"
#include "tp_meter.h"
+#include "translation-table.h"
-struct sk_buff;
-
-static struct genl_family batadv_netlink_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = BATADV_NL_NAME,
- .version = 1,
- .maxattr = BATADV_ATTR_MAX,
-};
+struct genl_family batadv_netlink_family;
/* multicast groups */
enum batadv_netlink_multicast_groups {
BATADV_NL_MCGRP_TPMETER,
};
-static struct genl_multicast_group batadv_netlink_mcgrps[] = {
+static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
[BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
};
-static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
+static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
[BATADV_ATTR_VERSION] = { .type = NLA_STRING },
[BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING },
[BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 },
@@ -69,9 +76,44 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
[BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 },
[BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 },
[BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 },
+ [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG },
+ [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 },
+ [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 },
+ [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 },
+ [BATADV_ATTR_TT_VID] = { .type = NLA_U16 },
+ [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 },
+ [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG },
+ [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 },
+ [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TQ] = { .type = NLA_U8 },
+ [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 },
+ [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 },
+ [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 },
+ [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG },
+ [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 },
+ [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 },
};
/**
+ * batadv_netlink_get_ifindex - Extract an interface index from a message
+ * @nlh: Message header
+ * @attrtype: Attribute which holds an interface index
+ *
+ * Return: interface index, or 0.
+ */
+int
+batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
+{
+ struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
+
+ return attr ? nla_get_u32(attr) : 0;
+}
+
+/**
* batadv_netlink_mesh_info_put - fill in generic information about mesh
* interface
* @msg: netlink message to be sent back
@@ -93,9 +135,17 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) ||
nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) ||
nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN,
- soft_iface->dev_addr))
+ soft_iface->dev_addr) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_TTVN,
+ (u8)atomic_read(&bat_priv->tt.vn)))
goto out;
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ ntohs(bat_priv->bla.claim_dest.group)))
+ goto out;
+#endif
+
primary_if = batadv_primary_if_get_selected(bat_priv);
if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
hard_iface = primary_if->net_dev;
@@ -380,7 +430,107 @@ out:
return ret;
}
-static struct genl_ops batadv_netlink_ops[] = {
+/**
+ * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hard_iface: Hard interface to dump
+ *
+ * Return: error code, or 0 on success
+ */
+static int
+batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct net_device *net_dev = hard_iface->net_dev;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_HARDIFS);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ net_dev->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ net_dev->name) ||
+ nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
+ net_dev->dev_addr))
+ goto nla_put_failure;
+
+ if (hard_iface->if_status == BATADV_IF_ACTIVE) {
+ if (nla_put_flag(msg, BATADV_ATTR_ACTIVE))
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_dump_hardifs - Dump all hard interface into a messages
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: error code, or length of reply message on success
+ */
+static int
+batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_hard_iface *hard_iface;
+ int ifindex;
+ int portid = NETLINK_CB(cb->skb).portid;
+ int seq = cb->nlh->nlmsg_seq;
+ int skip = cb->args[0];
+ int i = 0;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface)
+ return -ENODEV;
+
+ if (!batadv_softif_is_valid(soft_iface)) {
+ dev_put(soft_iface);
+ return -ENODEV;
+ }
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ if (i++ < skip)
+ continue;
+
+ if (batadv_netlink_dump_hardif_entry(msg, portid, seq,
+ hard_iface)) {
+ i--;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ dev_put(soft_iface);
+
+ cb->args[0] = i;
+
+ return msg->len;
+}
+
+static const struct genl_ops batadv_netlink_ops[] = {
{
.cmd = BATADV_CMD_GET_MESH_INFO,
.flags = GENL_ADMIN_PERM,
@@ -399,6 +549,74 @@ static struct genl_ops batadv_netlink_ops[] = {
.policy = batadv_netlink_policy,
.doit = batadv_netlink_tp_meter_cancel,
},
+ {
+ .cmd = BATADV_CMD_GET_ROUTING_ALGOS,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_algo_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_HARDIFS,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_netlink_dump_hardifs,
+ },
+ {
+ .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_tt_local_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_tt_global_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_ORIGINATORS,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_orig_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_NEIGHBORS,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_hardif_neigh_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_GATEWAYS,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_gw_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_BLA_CLAIM,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_bla_claim_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_BLA_BACKBONE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = batadv_netlink_policy,
+ .dumpit = batadv_bla_backbone_dump,
+ },
+
+};
+
+struct genl_family batadv_netlink_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = BATADV_NL_NAME,
+ .version = 1,
+ .maxattr = BATADV_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = batadv_netlink_ops,
+ .n_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .mcgrps = batadv_netlink_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
};
/**
@@ -408,9 +626,7 @@ void __init batadv_netlink_register(void)
{
int ret;
- ret = genl_register_family_with_ops_groups(&batadv_netlink_family,
- batadv_netlink_ops,
- batadv_netlink_mcgrps);
+ ret = genl_register_family(&batadv_netlink_family);
if (ret)
pr_warn("unable to register netlink family");
}
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 945653ab58c6..52eb16281aba 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -21,12 +21,18 @@
#include "main.h"
#include <linux/types.h>
+#include <net/genetlink.h>
+
+struct nlmsghdr;
void batadv_netlink_register(void);
void batadv_netlink_unregister(void);
+int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
u8 result, u32 test_time, u64 total_bytes,
u32 cookie);
+extern struct genl_family batadv_netlink_family;
+
#endif /* _NET_BATMAN_ADV_NETLINK_H_ */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 293ef4ffd4e1..ab5a3bf0765f 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -44,7 +44,6 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
@@ -261,10 +260,16 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
/**
* batadv_nc_packet_free - frees nc packet
* @nc_packet: the nc packet to free
+ * @dropped: whether the packet is freed because is is dropped
*/
-static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
+static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
+ bool dropped)
{
- kfree_skb(nc_packet->skb);
+ if (dropped)
+ kfree_skb(nc_packet->skb);
+ else
+ consume_skb(nc_packet->skb);
+
batadv_nc_path_put(nc_packet->nc_path);
kfree(nc_packet);
}
@@ -577,7 +582,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
{
batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
}
/**
@@ -611,7 +616,7 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
/* purge nc packet */
list_del(&nc_packet->list);
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, true);
res = true;
@@ -856,14 +861,12 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
if (!nc_node)
return NULL;
- kref_get(&orig_neigh_node->refcount);
-
/* Initialize nc_node */
INIT_LIST_HEAD(&nc_node->list);
+ kref_init(&nc_node->refcount);
ether_addr_copy(nc_node->addr, orig_node->orig);
+ kref_get(&orig_neigh_node->refcount);
nc_node->orig_node = orig_neigh_node;
- kref_init(&nc_node->refcount);
- kref_get(&nc_node->refcount);
/* Select ingoing or outgoing coding node */
if (in_coding) {
@@ -879,6 +882,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
/* Add nc_node to orig_node */
spin_lock_bh(lock);
+ kref_get(&nc_node->refcount);
list_add_tail_rcu(&nc_node->list, list);
spin_unlock_bh(lock);
@@ -979,7 +983,6 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
INIT_LIST_HEAD(&nc_path->packet_list);
spin_lock_init(&nc_path->packet_list_lock);
kref_init(&nc_path->refcount);
- kref_get(&nc_path->refcount);
nc_path->last_valid = jiffies;
ether_addr_copy(nc_path->next_hop, dst);
ether_addr_copy(nc_path->prev_hop, src);
@@ -989,6 +992,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
nc_path->next_hop);
/* Add nc_path to hash table */
+ kref_get(&nc_path->refcount);
hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
batadv_nc_hash_choose, &nc_path_key,
&nc_path->hash_entry);
@@ -1210,11 +1214,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
}
/* skb_src is now coded into skb_dest, so free it */
- kfree_skb(skb_src);
+ consume_skb(skb_src);
/* avoid duplicate free of skb from nc_packet */
nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
/* Send the coded packet and return true */
batadv_send_unicast_skb(skb_dest, first_dest);
@@ -1401,7 +1405,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
/* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
* our ref
*/
- kfree_skb(skb);
+ consume_skb(skb);
}
/**
@@ -1725,7 +1729,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
ether_addr_copy(unicast_packet->dest, orig_dest);
unicast_packet->ttvn = ttvn;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
return unicast_packet;
}
@@ -1815,11 +1819,11 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
/* Check if network coding is enabled */
if (!atomic_read(&bat_priv->network_coding))
- return NET_RX_DROP;
+ goto free_skb;
/* Make sure we can access (and remove) header */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- return NET_RX_DROP;
+ goto free_skb;
coded_packet = (struct batadv_coded_packet *)skb->data;
ethhdr = eth_hdr(skb);
@@ -1827,7 +1831,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
/* Verify frame is destined for us */
if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
!batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- return NET_RX_DROP;
+ goto free_skb;
/* Update stat counter */
if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
@@ -1837,7 +1841,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
coded_packet);
if (!nc_packet) {
batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- return NET_RX_DROP;
+ goto free_skb;
}
/* Make skb's linear, because decoding accesses the entire buffer */
@@ -1862,7 +1866,10 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
return batadv_recv_unicast_packet(skb, recv_if);
free_nc_packet:
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, true);
+free_skb:
+ kfree_skb(skb);
+
return NET_RX_DROP;
}
@@ -1882,6 +1889,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
batadv_hash_destroy(bat_priv->nc.decoding_hash);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_nc_nodes_seq_print_text - print the nc node information
* @seq: seq file to print on
@@ -1961,17 +1969,16 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
if (!nc_dir)
goto out;
- file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir,
- &bat_priv->nc.min_tq);
+ file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
if (!file)
goto out;
- file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir,
+ file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
&bat_priv->nc.max_fwd_delay);
if (!file)
goto out;
- file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir,
+ file = debugfs_create_u32("max_buffer_time", 0644, nc_dir,
&bat_priv->nc.max_buffer_time);
if (!file)
goto out;
@@ -1981,3 +1988,4 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
out:
return -ENOMEM;
}
+#endif
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 3940b5d24421..8f3b2969cc4e 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -28,11 +28,15 @@
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "distributed-arp-table.h"
@@ -42,8 +46,10 @@
#include "hash.h"
#include "log.h"
#include "multicast.h"
+#include "netlink.h"
#include "network-coding.h"
#include "routing.h"
+#include "soft-interface.h"
#include "translation-table.h"
/* hash class keys */
@@ -127,9 +133,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
goto out;
kref_init(&vlan->refcount);
- kref_get(&vlan->refcount);
vlan->vid = vid;
+ kref_get(&vlan->refcount);
hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
out:
@@ -358,7 +364,7 @@ struct batadv_orig_ifinfo *
batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo;
unsigned long reset_time;
spin_lock_bh(&orig_node->neigh_list_lock);
@@ -380,6 +386,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
orig_ifinfo->if_outgoing = if_outgoing;
INIT_HLIST_NODE(&orig_ifinfo->list);
kref_init(&orig_ifinfo->refcount);
+
kref_get(&orig_ifinfo->refcount);
hlist_add_head_rcu(&orig_ifinfo->list,
&orig_node->ifinfo_list);
@@ -453,9 +460,9 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
INIT_HLIST_NODE(&neigh_ifinfo->list);
kref_init(&neigh_ifinfo->refcount);
- kref_get(&neigh_ifinfo->refcount);
neigh_ifinfo->if_outgoing = if_outgoing;
+ kref_get(&neigh_ifinfo->refcount);
hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
out:
@@ -505,15 +512,17 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
* batadv_hardif_neigh_create - create a hardif neighbour node
* @hard_iface: the interface this neighbour is connected to
* @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
*
* Return: the hardif neighbour node if found or created or NULL otherwise.
*/
static struct batadv_hardif_neigh_node *
batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr)
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh;
spin_lock_bh(&hard_iface->neigh_list_lock);
@@ -529,6 +538,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
kref_get(&hard_iface->refcount);
INIT_HLIST_NODE(&hardif_neigh->list);
ether_addr_copy(hardif_neigh->addr, neigh_addr);
+ ether_addr_copy(hardif_neigh->orig, orig_node->orig);
hardif_neigh->if_incoming = hard_iface;
hardif_neigh->last_seen = jiffies;
@@ -537,7 +547,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
if (bat_priv->algo_ops->neigh.hardif_init)
bat_priv->algo_ops->neigh.hardif_init(hardif_neigh);
- hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list);
+ hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list);
out:
spin_unlock_bh(&hard_iface->neigh_list_lock);
@@ -549,21 +559,23 @@ out:
* node
* @hard_iface: the interface this neighbour is connected to
* @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
*
* Return: the hardif neighbour node if found or created or NULL otherwise.
*/
static struct batadv_hardif_neigh_node *
batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr)
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
{
- struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh;
/* first check without locking to avoid the overhead */
hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
if (hardif_neigh)
return hardif_neigh;
- return batadv_hardif_neigh_create(hard_iface, neigh_addr);
+ return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node);
}
/**
@@ -623,7 +635,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
goto out;
hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface,
- neigh_addr);
+ neigh_addr, orig_node);
if (!hardif_neigh)
goto out;
@@ -647,8 +659,8 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
/* extra reference for return */
kref_init(&neigh_node->refcount);
- kref_get(&neigh_node->refcount);
+ kref_get(&neigh_node->refcount);
hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
@@ -676,7 +688,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr)
{
- struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_neigh_node *neigh_node;
/* first check without locking to avoid the overhead */
neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
@@ -686,6 +698,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_hardif_neigh_seq_print_text - print the single hop neighbour list
* @seq: neighbour table seq_file struct
@@ -719,6 +732,84 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
bat_priv->algo_ops->neigh.print(bat_priv, seq);
return 0;
}
+#endif
+
+/**
+ * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific
+ * outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct net_device *hard_iface = NULL;
+ struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ int ret;
+ int ifindex, hard_ifindex;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_HARD_IFINDEX);
+ if (hard_ifindex) {
+ hard_iface = dev_get_by_index(net, hard_ifindex);
+ if (hard_iface)
+ hardif = batadv_hardif_get_by_netdev(hard_iface);
+
+ if (!hardif) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (hardif->soft_iface != soft_iface) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ if (!bat_priv->algo_ops->neigh.dump) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif);
+
+ ret = msg->len;
+
+ out:
+ if (hardif)
+ batadv_hardif_put(hardif);
+ if (hard_iface)
+ dev_put(hard_iface);
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
+}
/**
* batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
@@ -905,7 +996,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
/* extra reference for return */
kref_init(&orig_node->refcount);
- kref_get(&orig_node->refcount);
orig_node->bat_priv = bat_priv;
ether_addr_copy(orig_node->orig, addr);
@@ -936,7 +1026,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
batadv_orig_node_vlan_put(vlan);
for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
- INIT_HLIST_HEAD(&orig_node->fragments[i].head);
+ INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list);
spin_lock_init(&orig_node->fragments[i].lock);
orig_node->fragments[i].size = 0;
}
@@ -1256,6 +1346,7 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
_batadv_purge_orig(bat_priv);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
{
struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1329,6 +1420,84 @@ out:
batadv_hardif_put(hard_iface);
return 0;
}
+#endif
+
+/**
+ * batadv_orig_dump - Dump to netlink the originator infos for a specific
+ * outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct net_device *hard_iface = NULL;
+ struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ int ret;
+ int ifindex, hard_ifindex;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_HARD_IFINDEX);
+ if (hard_ifindex) {
+ hard_iface = dev_get_by_index(net, hard_ifindex);
+ if (hard_iface)
+ hardif = batadv_hardif_get_by_netdev(hard_iface);
+
+ if (!hardif) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (hardif->soft_iface != soft_iface) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ if (!bat_priv->algo_ops->orig.dump) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif);
+
+ ret = msg->len;
+
+ out:
+ if (hardif)
+ batadv_hardif_put(hardif);
+ if (hard_iface)
+ dev_put(hard_iface);
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ return ret;
+}
int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
int max_if_num)
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 566306bf05dc..ebc56183f358 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -31,7 +31,9 @@
#include "hash.h"
+struct netlink_callback;
struct seq_file;
+struct sk_buff;
bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
@@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
struct batadv_hard_iface *if_outgoing);
void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
struct batadv_orig_ifinfo *
@@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
int max_if_num);
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6b011ff64dd8..7a36bcfa0ba0 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -21,7 +21,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
-#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0)
+#define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0)
/**
* enum batadv_packettype - types for batman-adv encapsulated packets
@@ -129,42 +129,6 @@ enum batadv_tt_data_flags {
};
/**
- * enum batadv_tt_client_flags - TT client specific flags
- * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
- * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
- * update telling its new real location has not been received/sent yet
- * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
- * This information is used by the "AP Isolation" feature
- * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
- * information is used by the Extended Isolation feature
- * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
- * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
- * not been announced yet
- * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
- * in the table for one more originator interval for consistency purposes
- * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
- * the network but no nnode has already announced it
- *
- * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
- * Bits from 8 to 15 are called _local flags_ because they are used for local
- * computations only.
- *
- * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with
- * the other nodes in the network. To achieve this goal these flags are included
- * in the TT CRC computation.
- */
-enum batadv_tt_client_flags {
- BATADV_TT_CLIENT_DEL = BIT(0),
- BATADV_TT_CLIENT_ROAM = BIT(1),
- BATADV_TT_CLIENT_WIFI = BIT(4),
- BATADV_TT_CLIENT_ISOLA = BIT(5),
- BATADV_TT_CLIENT_NOPURGE = BIT(8),
- BATADV_TT_CLIENT_NEW = BIT(9),
- BATADV_TT_CLIENT_PENDING = BIT(10),
- BATADV_TT_CLIENT_TEMP = BIT(11),
-};
-
-/**
* enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
* @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
*/
@@ -288,16 +252,6 @@ struct batadv_elp_packet {
#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
/**
- * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes
- * @BATADV_TP_START: start a throughput meter run
- * @BATADV_TP_STOP: stop a throughput meter run
- */
-enum batadv_icmp_user_cmd_type {
- BATADV_TP_START = 0,
- BATADV_TP_STOP = 2,
-};
-
-/**
* struct batadv_icmp_header - common members among all the ICMP packets
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 3d199478c405..6713bdf414cd 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -74,11 +74,23 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
if (!orig_ifinfo)
return;
- rcu_read_lock();
- curr_router = rcu_dereference(orig_ifinfo->router);
- if (curr_router && !kref_get_unless_zero(&curr_router->refcount))
- curr_router = NULL;
- rcu_read_unlock();
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ /* curr_router used earlier may not be the current orig_ifinfo->router
+ * anymore because it was dereferenced outside of the neigh_list_lock
+ * protected region. After the new best neighbor has replace the current
+ * best neighbor the reference counter needs to decrease. Consequently,
+ * the code needs to ensure the curr_router variable contains a pointer
+ * to the replaced best neighbor.
+ */
+ curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
+
+ /* increase refcount of new best neighbor */
+ if (neigh_node)
+ kref_get(&neigh_node->refcount);
+
+ rcu_assign_pointer(orig_ifinfo->router, neigh_node);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ batadv_orig_ifinfo_put(orig_ifinfo);
/* route deleted */
if ((curr_router) && (!neigh_node)) {
@@ -100,27 +112,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
curr_router->addr);
}
- if (curr_router)
- batadv_neigh_node_put(curr_router);
-
- spin_lock_bh(&orig_node->neigh_list_lock);
- /* curr_router used earlier may not be the current orig_ifinfo->router
- * anymore because it was dereferenced outside of the neigh_list_lock
- * protected region. After the new best neighbor has replace the current
- * best neighbor the reference counter needs to decrease. Consequently,
- * the code needs to ensure the curr_router variable contains a pointer
- * to the replaced best neighbor.
- */
- curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
-
- /* increase refcount of new best neighbor */
- if (neigh_node)
- kref_get(&neigh_node->refcount);
-
- rcu_assign_pointer(orig_ifinfo->router, neigh_node);
- spin_unlock_bh(&orig_node->neigh_list_lock);
- batadv_orig_ifinfo_put(orig_ifinfo);
-
/* decrease refcount of previous best neighbor */
if (curr_router)
batadv_neigh_node_put(curr_router);
@@ -205,8 +196,8 @@ bool batadv_check_management_packet(struct sk_buff *skb,
if (!is_broadcast_ether_addr(ethhdr->h_dest))
return false;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
+ /* packet with invalid sender address */
+ if (!is_valid_ether_addr(ethhdr->h_source))
return false;
/* create a copy of the skb, if needed, to modify it. */
@@ -271,11 +262,11 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
icmph->ttl = BATADV_TTL;
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res == -1)
- goto out;
-
- ret = NET_RX_SUCCESS;
+ if (res == NET_XMIT_SUCCESS)
+ ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
break;
case BATADV_TP:
if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet)))
@@ -283,6 +274,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
batadv_tp_meter_recv(bat_priv, skb);
ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
goto out;
default:
/* drop unknown type */
@@ -293,6 +286,9 @@ out:
batadv_hardif_put(primary_if);
if (orig_node)
batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
return ret;
}
@@ -334,14 +330,20 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
icmp_packet->ttl = BATADV_TTL;
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res != -1)
- ret = NET_RX_SUCCESS;
+ if (res == NET_RX_SUCCESS)
+ ret = NET_XMIT_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
out:
if (primary_if)
batadv_hardif_put(primary_if);
if (orig_node)
batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
return ret;
}
@@ -358,21 +360,21 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
- /* packet with unicast indication but broadcast recipient */
- if (is_broadcast_ether_addr(ethhdr->h_dest))
- goto out;
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
+ goto free_skb;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
- goto out;
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
/* not for me */
if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
- goto out;
+ goto free_skb;
icmph = (struct batadv_icmp_header *)skb->data;
@@ -381,17 +383,17 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
icmph->msg_type == BATADV_ECHO_REQUEST) &&
(skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
if (skb_linearize(skb) < 0)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
icmph = (struct batadv_icmp_header *)skb->data;
icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
- goto out;
+ goto free_skb;
ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur],
ethhdr->h_dest);
@@ -409,11 +411,11 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* get routing information */
orig_node = batadv_orig_hash_find(bat_priv, icmph->dst);
if (!orig_node)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto put_orig_node;
icmph = (struct batadv_icmp_header *)skb->data;
@@ -422,12 +424,18 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* route it */
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
- if (res != -1)
+ if (res == NET_XMIT_SUCCESS)
ret = NET_RX_SUCCESS;
-out:
+ /* skb was consumed */
+ skb = NULL;
+
+put_orig_node:
if (orig_node)
batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
+
return ret;
}
@@ -454,12 +462,12 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
ethhdr = eth_hdr(skb);
- /* packet with unicast indication but broadcast recipient */
- if (is_broadcast_ether_addr(ethhdr->h_dest))
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
return -EBADR;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
return -EBADR;
/* not for me */
@@ -676,18 +684,18 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
if (unicast_packet->ttl < 2) {
pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n",
ethhdr->h_source, unicast_packet->dest);
- goto out;
+ goto free_skb;
}
/* get routing information */
orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest);
if (!orig_node)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto put_orig_node;
/* decrement ttl */
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -711,8 +719,11 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
- if (res == -1)
- goto out;
+ if (res == NET_XMIT_SUCCESS)
+ ret = NET_RX_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
/* translate transmit result into receive result */
if (res == NET_XMIT_SUCCESS) {
@@ -722,11 +733,11 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
len + ETH_HLEN);
}
- ret = NET_RX_SUCCESS;
+put_orig_node:
+ batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
-out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
return ret;
}
@@ -911,14 +922,18 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
if (check < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* we don't know about this type, drop it. */
unicast_packet = (struct batadv_unicast_packet *)skb->data;
if (batadv_is_my_mac(bat_priv, unicast_packet->dest))
- return NET_RX_DROP;
+ goto free_skb;
return batadv_route_unicast_packet(skb, recv_if);
+
+free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
int batadv_recv_unicast_packet(struct sk_buff *skb,
@@ -932,6 +947,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
int check, hdr_size = sizeof(*unicast_packet);
enum batadv_subtype subtype;
bool is4addr;
+ int ret = NET_RX_DROP;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
@@ -951,9 +967,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
if (check < 0)
- return NET_RX_DROP;
+ goto free_skb;
if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
- return NET_RX_DROP;
+ goto free_skb;
/* packet for me */
if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
@@ -991,7 +1007,14 @@ rx_success:
return NET_RX_SUCCESS;
}
- return batadv_route_unicast_packet(skb, recv_if);
+ ret = batadv_route_unicast_packet(skb, recv_if);
+ /* skb was consumed */
+ skb = NULL;
+
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
}
/**
@@ -1013,15 +1036,15 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
int ret = NET_RX_DROP;
if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* the header is likely to be modified while forwarding */
if (skb_cow(skb, hdr_size) < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* packet needs to be linearized to access the tvlv content */
if (skb_linearize(skb) < 0)
- return NET_RX_DROP;
+ goto free_skb;
unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data;
@@ -1029,17 +1052,21 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len);
if (tvlv_buff_len > skb->len - hdr_size)
- return NET_RX_DROP;
+ goto free_skb;
ret = batadv_tvlv_containers_process(bat_priv, false, NULL,
unicast_tvlv_packet->src,
unicast_tvlv_packet->dst,
tvlv_buff, tvlv_buff_len);
- if (ret != NET_RX_SUCCESS)
+ if (ret != NET_RX_SUCCESS) {
ret = batadv_route_unicast_packet(skb, recv_if);
- else
- consume_skb(skb);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -1065,20 +1092,22 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
if (batadv_check_unicast_packet(bat_priv, skb,
sizeof(*frag_packet)) < 0)
- goto out;
+ goto free_skb;
frag_packet = (struct batadv_frag_packet *)skb->data;
orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig);
if (!orig_node_src)
- goto out;
+ goto free_skb;
skb->priority = frag_packet->priority + 256;
/* Route the fragment if it is not for us and too big to be merged. */
if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
+ /* skb was consumed */
+ skb = NULL;
ret = NET_RX_SUCCESS;
- goto out;
+ goto put_orig_node;
}
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX);
@@ -1086,20 +1115,24 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
/* Add fragment to buffer and merge if possible. */
if (!batadv_frag_skb_buffer(&skb, orig_node_src))
- goto out;
+ goto put_orig_node;
/* Deliver merged packet to the appropriate handler, if it was
* merged
*/
- if (skb)
+ if (skb) {
batadv_batman_skb_recv(skb, recv_if->net_dev,
&recv_if->batman_adv_ptype, NULL);
+ /* skb was consumed */
+ skb = NULL;
+ }
ret = NET_RX_SUCCESS;
-out:
- if (orig_node_src)
- batadv_orig_node_put(orig_node_src);
+put_orig_node:
+ batadv_orig_node_put(orig_node_src);
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -1118,35 +1151,35 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
/* packet with broadcast indication but unicast recipient */
if (!is_broadcast_ether_addr(ethhdr->h_dest))
- goto out;
+ goto free_skb;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
- goto out;
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
/* ignore broadcasts sent by myself */
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- goto out;
+ goto free_skb;
bcast_packet = (struct batadv_bcast_packet *)skb->data;
/* ignore broadcasts originated by myself */
if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
- goto out;
+ goto free_skb;
if (bcast_packet->ttl < 2)
- goto out;
+ goto free_skb;
orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
if (!orig_node)
- goto out;
+ goto free_skb;
spin_lock_bh(&orig_node->bcast_seqno_lock);
@@ -1174,18 +1207,18 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
/* check whether this has been sent by another originator before */
if (batadv_bla_check_bcast_duplist(bat_priv, skb))
- goto out;
+ goto free_skb;
batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
/* rebroadcast packet */
- batadv_add_bcast_packet_to_list(bat_priv, skb, 1);
+ batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false);
/* don't hand the broadcast up if it is from an originator
* from the same backbone.
*/
if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size))
- goto out;
+ goto free_skb;
if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size))
goto rx_success;
@@ -1201,6 +1234,8 @@ rx_success:
spin_unlock:
spin_unlock_bh(&orig_node->bcast_seqno_lock);
+free_skb:
+ kfree_skb(skb);
out:
if (orig_node)
batadv_orig_node_put(orig_node);
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 6191159484df..49021b7124f3 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -19,6 +19,7 @@
#include "main.h"
#include <linux/atomic.h>
+#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -64,8 +65,11 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
* If neigh_node is NULL, then the packet is broadcasted using hard_iface,
* otherwise it is sent as unicast to the given neighbor.
*
- * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb)
- * otherwise
+ * Regardless of the return value, the skb is consumed.
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
*/
int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
@@ -73,7 +77,6 @@ int batadv_send_skb_packet(struct sk_buff *skb,
{
struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
- int ret;
bat_priv = netdev_priv(hard_iface->soft_iface);
@@ -111,15 +114,8 @@ int batadv_send_skb_packet(struct sk_buff *skb,
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
- *
- * a negative value cannot be returned because it could be interepreted
- * as not consumed skb by callers of batadv_send_skb_to_orig.
*/
- ret = dev_queue_xmit(skb);
- if (ret < 0)
- ret = NET_XMIT_DROP;
-
- return ret;
+ return dev_queue_xmit(skb);
send_skb_err:
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -165,11 +161,9 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
* host, NULL can be passed as recv_if and no interface alternating is
* attempted.
*
- * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the
- * skb is buffered for later transmit or the NET_XMIT status returned by the
+ * Return: negative errno code on a failure, -EINPROGRESS if the skb is
+ * buffered for later transmit or the NET_XMIT status returned by the
* lower routine if the packet has been passed down.
- *
- * If the returning value is not -1 the skb has been consumed.
*/
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -177,12 +171,14 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = orig_node->bat_priv;
struct batadv_neigh_node *neigh_node;
- int ret = -1;
+ int ret;
/* batadv_find_router() increases neigh_nodes refcount if found. */
neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
- if (!neigh_node)
- goto out;
+ if (!neigh_node) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
/* Check if the skb is too large to send in one piece and fragment
* it if needed.
@@ -191,8 +187,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
skb->len > neigh_node->if_incoming->net_dev->mtu) {
/* Fragment and send packet. */
ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
- goto out;
+ goto put_neigh_node;
}
/* try to network code the packet, if it is received on an interface
@@ -204,9 +202,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
else
ret = batadv_send_unicast_skb(skb, neigh_node);
-out:
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
+ /* skb was consumed */
+ skb = NULL;
+
+put_neigh_node:
+ batadv_neigh_node_put(neigh_node);
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -315,8 +317,7 @@ out:
*
* Wrap the given skb into a batman-adv unicast or unicast-4addr header
* depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied
- * as packet_type. Then send this frame to the given orig_node and release a
- * reference to this orig_node.
+ * as packet_type. Then send this frame to the given orig_node.
*
* Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
@@ -328,7 +329,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
{
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr;
- int res, ret = NET_XMIT_DROP;
+ int ret = NET_XMIT_DROP;
if (!orig_node)
goto out;
@@ -365,15 +366,12 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
unicast_packet->ttvn = unicast_packet->ttvn - 1;
- res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res != -1)
- ret = NET_XMIT_SUCCESS;
+ ret = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ /* skb was consumed */
+ skb = NULL;
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
- if (ret == NET_XMIT_DROP)
- kfree_skb(skb);
+ kfree_skb(skb);
return ret;
}
@@ -403,6 +401,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
struct batadv_orig_node *orig_node;
u8 *src, *dst;
+ int ret;
src = ethhdr->h_source;
dst = ethhdr->h_dest;
@@ -414,8 +413,13 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
}
orig_node = batadv_transtable_search(bat_priv, src, dst, vid);
- return batadv_send_skb_unicast(bat_priv, skb, packet_type,
- packet_subtype, orig_node, vid);
+ ret = batadv_send_skb_unicast(bat_priv, skb, packet_type,
+ packet_subtype, orig_node, vid);
+
+ if (orig_node)
+ batadv_orig_node_put(orig_node);
+
+ return ret;
}
/**
@@ -433,35 +437,292 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid)
{
struct batadv_orig_node *orig_node;
+ int ret;
orig_node = batadv_gw_get_selected_orig(bat_priv);
- return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
- BATADV_P_DATA, orig_node, vid);
+ ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
+ BATADV_P_DATA, orig_node, vid);
+
+ if (orig_node)
+ batadv_orig_node_put(orig_node);
+
+ return ret;
}
-void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
+/**
+ * batadv_forw_packet_free - free a forwarding packet
+ * @forw_packet: The packet to free
+ * @dropped: whether the packet is freed because is is dropped
+ *
+ * This frees a forwarding packet and releases any resources it might
+ * have claimed.
+ */
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped)
{
- kfree_skb(forw_packet->skb);
+ if (dropped)
+ kfree_skb(forw_packet->skb);
+ else
+ consume_skb(forw_packet->skb);
+
if (forw_packet->if_incoming)
batadv_hardif_put(forw_packet->if_incoming);
if (forw_packet->if_outgoing)
batadv_hardif_put(forw_packet->if_outgoing);
+ if (forw_packet->queue_left)
+ atomic_inc(forw_packet->queue_left);
kfree(forw_packet);
}
+/**
+ * batadv_forw_packet_alloc - allocate a forwarding packet
+ * @if_incoming: The (optional) if_incoming to be grabbed
+ * @if_outgoing: The (optional) if_outgoing to be grabbed
+ * @queue_left: The (optional) queue counter to decrease
+ * @bat_priv: The bat_priv for the mesh of this forw_packet
+ *
+ * Allocates a forwarding packet and tries to get a reference to the
+ * (optional) if_incoming, if_outgoing and queue_left. If queue_left
+ * is NULL then bat_priv is optional, too.
+ *
+ * Return: An allocated forwarding packet on success, NULL otherwise.
+ */
+struct batadv_forw_packet *
+batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ atomic_t *queue_left,
+ struct batadv_priv *bat_priv)
+{
+ struct batadv_forw_packet *forw_packet;
+ const char *qname;
+
+ if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) {
+ qname = "unknown";
+
+ if (queue_left == &bat_priv->bcast_queue_left)
+ qname = "bcast";
+
+ if (queue_left == &bat_priv->batman_queue_left)
+ qname = "batman";
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s queue is full\n", qname);
+
+ return NULL;
+ }
+
+ forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
+ if (!forw_packet)
+ goto err;
+
+ if (if_incoming)
+ kref_get(&if_incoming->refcount);
+
+ if (if_outgoing)
+ kref_get(&if_outgoing->refcount);
+
+ INIT_HLIST_NODE(&forw_packet->list);
+ INIT_HLIST_NODE(&forw_packet->cleanup_list);
+ forw_packet->skb = NULL;
+ forw_packet->queue_left = queue_left;
+ forw_packet->if_incoming = if_incoming;
+ forw_packet->if_outgoing = if_outgoing;
+ forw_packet->num_packets = 0;
+
+ return forw_packet;
+
+err:
+ if (queue_left)
+ atomic_inc(queue_left);
+
+ return NULL;
+}
+
+/**
+ * batadv_forw_packet_was_stolen - check whether someone stole this packet
+ * @forw_packet: the forwarding packet to check
+ *
+ * This function checks whether the given forwarding packet was claimed by
+ * someone else for free().
+ *
+ * Return: True if someone stole it, false otherwise.
+ */
+static bool
+batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet)
+{
+ return !hlist_unhashed(&forw_packet->cleanup_list);
+}
+
+/**
+ * batadv_forw_packet_steal - claim a forw_packet for free()
+ * @forw_packet: the forwarding packet to steal
+ * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock)
+ *
+ * This function tries to steal a specific forw_packet from global
+ * visibility for the purpose of getting it for free(). That means
+ * the caller is *not* allowed to requeue it afterwards.
+ *
+ * Return: True if stealing was successful. False if someone else stole it
+ * before us.
+ */
+bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock)
+{
+ /* did purging routine steal it earlier? */
+ spin_lock_bh(lock);
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ spin_unlock_bh(lock);
+ return false;
+ }
+
+ hlist_del_init(&forw_packet->list);
+
+ /* Just to spot misuse of this function */
+ hlist_add_fake(&forw_packet->cleanup_list);
+
+ spin_unlock_bh(lock);
+ return true;
+}
+
+/**
+ * batadv_forw_packet_list_steal - claim a list of forward packets for free()
+ * @forw_list: the to be stolen forward packets
+ * @cleanup_list: a backup pointer, to be able to dispose the packet later
+ * @hard_iface: the interface to steal forward packets from
+ *
+ * This function claims responsibility to free any forw_packet queued on the
+ * given hard_iface. If hard_iface is NULL forwarding packets on all hard
+ * interfaces will be claimed.
+ *
+ * The packets are being moved from the forw_list to the cleanup_list and
+ * by that allows already running threads to notice the claiming.
+ */
static void
-_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
- struct batadv_forw_packet *forw_packet,
- unsigned long send_time)
+batadv_forw_packet_list_steal(struct hlist_head *forw_list,
+ struct hlist_head *cleanup_list,
+ const struct batadv_hard_iface *hard_iface)
{
- /* add new packet to packet list */
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list);
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
+ forw_list, list) {
+ /* if purge_outstanding_packets() was called with an argument
+ * we delete only packets belonging to the given interface
+ */
+ if (hard_iface &&
+ (forw_packet->if_incoming != hard_iface) &&
+ (forw_packet->if_outgoing != hard_iface))
+ continue;
+
+ hlist_del(&forw_packet->list);
+ hlist_add_head(&forw_packet->cleanup_list, cleanup_list);
+ }
+}
+
+/**
+ * batadv_forw_packet_list_free - free a list of forward packets
+ * @head: a list of to be freed forw_packets
+ *
+ * This function cancels the scheduling of any packet in the provided list,
+ * waits for any possibly running packet forwarding thread to finish and
+ * finally, safely frees this forward packet.
+ *
+ * This function might sleep.
+ */
+static void batadv_forw_packet_list_free(struct hlist_head *head)
+{
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head,
+ cleanup_list) {
+ cancel_delayed_work_sync(&forw_packet->delayed_work);
+
+ hlist_del(&forw_packet->cleanup_list);
+ batadv_forw_packet_free(forw_packet, true);
+ }
+}
+
+/**
+ * batadv_forw_packet_queue - try to queue a forwarding packet
+ * @forw_packet: the forwarding packet to queue
+ * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock)
+ * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list)
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a forwarding packet. Requeuing
+ * is prevented if the according interface is shutting down
+ * (e.g. if batadv_forw_packet_list_steal() was called for this
+ * packet earlier).
+ *
+ * Calling batadv_forw_packet_queue() after a call to
+ * batadv_forw_packet_steal() is forbidden!
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock, struct hlist_head *head,
+ unsigned long send_time)
+{
+ spin_lock_bh(lock);
+
+ /* did purging routine steal it from us? */
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ /* If you got it for free() without trouble, then
+ * don't get back into the queue after stealing...
+ */
+ WARN_ONCE(hlist_fake(&forw_packet->cleanup_list),
+ "Requeuing after batadv_forw_packet_steal() not allowed!\n");
+
+ spin_unlock_bh(lock);
+ return;
+ }
+
+ hlist_del_init(&forw_packet->list);
+ hlist_add_head(&forw_packet->list, head);
+
+ queue_delayed_work(batadv_event_workqueue,
+ &forw_packet->delayed_work,
+ send_time - jiffies);
+ spin_unlock_bh(lock);
+}
+
+/**
+ * batadv_forw_packet_bcast_queue - try to queue a broadcast packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a broadcast packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void
+batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock,
+ &bat_priv->forw_bcast_list, send_time);
+}
- /* start timer for this packet */
- queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work,
- send_time);
+/**
+ * batadv_forw_packet_ogmv1_queue - try to queue an OGMv1 packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue an OGMv1 packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock,
+ &bat_priv->forw_bat_list, send_time);
}
/**
@@ -469,6 +730,7 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @skb: broadcast packet to add
* @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
*
* add a broadcast packet to the queue and setup timers. broadcast packets
* are sent multiple times to increase probability for being received.
@@ -480,69 +742,63 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
*/
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
- unsigned long delay)
+ unsigned long delay,
+ bool own_packet)
{
- struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hard_iface *primary_if;
struct batadv_forw_packet *forw_packet;
struct batadv_bcast_packet *bcast_packet;
struct sk_buff *newskb;
- if (!batadv_atomic_dec_not_zero(&bat_priv->bcast_queue_left)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "bcast packet queue full\n");
- goto out;
- }
-
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
- goto out_and_inc;
-
- forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
+ goto err;
+ forw_packet = batadv_forw_packet_alloc(primary_if, NULL,
+ &bat_priv->bcast_queue_left,
+ bat_priv);
+ batadv_hardif_put(primary_if);
if (!forw_packet)
- goto out_and_inc;
+ goto err;
newskb = skb_copy(skb, GFP_ATOMIC);
if (!newskb)
- goto packet_free;
+ goto err_packet_free;
/* as we have a copy now, it is safe to decrease the TTL */
bcast_packet = (struct batadv_bcast_packet *)newskb->data;
bcast_packet->ttl--;
- skb_reset_mac_header(newskb);
-
forw_packet->skb = newskb;
- forw_packet->if_incoming = primary_if;
- forw_packet->if_outgoing = NULL;
-
- /* how often did we send the bcast packet ? */
- forw_packet->num_packets = 0;
+ forw_packet->own = own_packet;
INIT_DELAYED_WORK(&forw_packet->delayed_work,
batadv_send_outstanding_bcast_packet);
- _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay);
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay);
return NETDEV_TX_OK;
-packet_free:
- kfree(forw_packet);
-out_and_inc:
- atomic_inc(&bat_priv->bcast_queue_left);
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+err_packet_free:
+ batadv_forw_packet_free(forw_packet, true);
+err:
return NETDEV_TX_BUSY;
}
static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
{
struct batadv_hard_iface *hard_iface;
+ struct batadv_hardif_neigh_node *neigh_node;
struct delayed_work *delayed_work;
struct batadv_forw_packet *forw_packet;
+ struct batadv_bcast_packet *bcast_packet;
struct sk_buff *skb1;
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
+ unsigned long send_time = jiffies + msecs_to_jiffies(5);
+ bool dropped = false;
+ u8 *neigh_addr;
+ u8 *orig_neigh;
+ int ret = 0;
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
@@ -550,15 +806,17 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
soft_iface = forw_packet->if_incoming->soft_iface;
bat_priv = netdev_priv(soft_iface);
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_del(&forw_packet->list);
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
-
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
goto out;
+ }
- if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet))
+ if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) {
+ dropped = true;
goto out;
+ }
+
+ bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data;
/* rebroadcast packet */
rcu_read_lock();
@@ -569,6 +827,49 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
if (forw_packet->num_packets >= hard_iface->num_bcasts)
continue;
+ if (forw_packet->own) {
+ neigh_node = NULL;
+ } else {
+ neigh_addr = eth_hdr(forw_packet->skb)->h_source;
+ neigh_node = batadv_hardif_neigh_get(hard_iface,
+ neigh_addr);
+ }
+
+ orig_neigh = neigh_node ? neigh_node->orig : NULL;
+
+ ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig,
+ orig_neigh);
+
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s surpressed: %s\n",
+ bcast_packet->orig,
+ hard_iface->net_dev->name, type);
+
+ if (neigh_node)
+ batadv_hardif_neigh_put(neigh_node);
+
+ continue;
+ }
+
+ if (neigh_node)
+ batadv_hardif_neigh_put(neigh_node);
+
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -585,23 +886,34 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
/* if we still have some more bcasts to send */
if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) {
- _batadv_add_bcast_packet_to_list(bat_priv, forw_packet,
- msecs_to_jiffies(5));
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet,
+ send_time);
return;
}
out:
- batadv_forw_packet_free(forw_packet);
- atomic_inc(&bat_priv->bcast_queue_left);
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bcast_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
}
+/**
+ * batadv_purge_outstanding_packets - stop/purge scheduled bcast/OGMv1 packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
+ *
+ * This method cancels and purges any broadcast and OGMv1 packet on the given
+ * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard
+ * interfaces will be canceled and purged.
+ *
+ * This function might sleep.
+ */
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface)
{
- struct batadv_forw_packet *forw_packet;
- struct hlist_node *safe_tmp_node;
- bool pending;
+ struct hlist_head head = HLIST_HEAD_INIT;
if (hard_iface)
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -611,63 +923,18 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"purge_outstanding_packets()\n");
- /* free bcast list */
+ /* claim bcast list for free() */
spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
- &bat_priv->forw_bcast_list, list) {
- /* if purge_outstanding_packets() was called with an argument
- * we delete only packets belonging to the given interface
- */
- if ((hard_iface) &&
- (forw_packet->if_incoming != hard_iface) &&
- (forw_packet->if_outgoing != hard_iface))
- continue;
-
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
-
- /* batadv_send_outstanding_bcast_packet() will lock the list to
- * delete the item from the list
- */
- pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
-
- if (pending) {
- hlist_del(&forw_packet->list);
- if (!forw_packet->own)
- atomic_inc(&bat_priv->bcast_queue_left);
-
- batadv_forw_packet_free(forw_packet);
- }
- }
+ batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head,
+ hard_iface);
spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
- /* free batman packet list */
+ /* claim batman packet list for free() */
spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
- &bat_priv->forw_bat_list, list) {
- /* if purge_outstanding_packets() was called with an argument
- * we delete only packets belonging to the given interface
- */
- if ((hard_iface) &&
- (forw_packet->if_incoming != hard_iface) &&
- (forw_packet->if_outgoing != hard_iface))
- continue;
-
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
-
- /* send_outstanding_bat_packet() will lock the list to
- * delete the item from the list
- */
- pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
-
- if (pending) {
- hlist_del(&forw_packet->list);
- if (!forw_packet->own)
- atomic_inc(&bat_priv->batman_queue_left);
-
- batadv_forw_packet_free(forw_packet);
- }
- }
+ batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head,
+ hard_iface);
spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* then cancel or wait for packet workers to finish and free */
+ batadv_forw_packet_list_free(&head);
}
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 7cecb7563b45..a94e1e8639ca 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -21,13 +21,25 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/spinlock.h>
#include <linux/types.h>
#include "packet.h"
struct sk_buff;
-void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet);
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped);
+struct batadv_forw_packet *
+batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ atomic_t *queue_left,
+ struct batadv_priv *bat_priv);
+bool batadv_forw_packet_steal(struct batadv_forw_packet *packet, spinlock_t *l);
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time);
+
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
@@ -40,7 +52,8 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node);
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
- unsigned long delay);
+ unsigned long delay,
+ bool own_packet);
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7527c0652dd5..7b3494ae6ad9 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -22,6 +22,7 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
@@ -39,6 +40,7 @@
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
@@ -46,7 +48,6 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/workqueue.h>
#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
@@ -57,6 +58,7 @@
#include "hard-interface.h"
#include "multicast.h"
#include "network-coding.h"
+#include "originator.h"
#include "packet.h"
#include "send.h"
#include "sysfs.h"
@@ -115,6 +117,26 @@ static int batadv_interface_release(struct net_device *dev)
return 0;
}
+/**
+ * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ * @bat_priv: the bat priv with all the soft interface information
+ * @idx: index of counter to sum up
+ *
+ * Return: sum of all cpu-local counters
+ */
+static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
+{
+ u64 *counters, sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
+ sum += counters[idx];
+ }
+
+ return sum;
+}
+
static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
@@ -335,12 +357,12 @@ send:
seqno = atomic_inc_return(&bat_priv->bcast_seqno);
bcast_packet->seqno = htonl(seqno);
- batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay);
+ batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true);
/* a copy is stored in the bcast list, therefore removing
* the original skb.
*/
- kfree_skb(skb);
+ consume_skb(skb);
/* unicast packet */
} else {
@@ -364,7 +386,7 @@ send:
ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
vid);
}
- if (ret == NET_XMIT_DROP)
+ if (ret != NET_XMIT_SUCCESS)
goto dropped_freed;
}
@@ -377,6 +399,8 @@ dropped:
dropped_freed:
batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
end:
+ if (mcast_single_orig)
+ batadv_orig_node_put(mcast_single_orig);
if (primary_if)
batadv_hardif_put(primary_if);
return NETDEV_TX_OK;
@@ -591,6 +615,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
}
spin_lock_bh(&bat_priv->softif_vlan_list_lock);
+ kref_get(&vlan->refcount);
hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
@@ -601,6 +626,9 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
bat_priv->soft_iface->dev_addr, vid,
BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+ /* don't return reference to new softif_vlan */
+ batadv_softif_vlan_put(vlan);
+
return 0;
}
@@ -747,34 +775,6 @@ static void batadv_set_lockdep_class(struct net_device *dev)
}
/**
- * batadv_softif_destroy_finish - cleans up the remains of a softif
- * @work: work queue item
- *
- * Free the parts of the soft interface which can not be removed under
- * rtnl lock (to prevent deadlock situations).
- */
-static void batadv_softif_destroy_finish(struct work_struct *work)
-{
- struct batadv_softif_vlan *vlan;
- struct batadv_priv *bat_priv;
- struct net_device *soft_iface;
-
- bat_priv = container_of(work, struct batadv_priv,
- cleanup_work);
- soft_iface = bat_priv->soft_iface;
-
- /* destroy the "untagged" VLAN */
- vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
- if (vlan) {
- batadv_softif_destroy_vlan(bat_priv, vlan);
- batadv_softif_vlan_put(vlan);
- }
-
- batadv_sysfs_del_meshif(soft_iface);
- unregister_netdev(soft_iface);
-}
-
-/**
* batadv_softif_init_late - late stage initialization of soft interface
* @dev: registered network device to modify
*
@@ -791,7 +791,6 @@ static int batadv_softif_init_late(struct net_device *dev)
bat_priv = netdev_priv(dev);
bat_priv->soft_iface = dev;
- INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish);
/* batadv_interface_stats() needs to be available as soon as
* register_netdevice() has been called
@@ -1028,8 +1027,19 @@ struct net_device *batadv_softif_create(struct net *net, const char *name)
void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
{
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_softif_vlan *vlan;
+
+ ASSERT_RTNL();
+
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_softif_vlan_put(vlan);
+ }
- queue_work(batadv_event_workqueue, &bat_priv->cleanup_work);
+ batadv_sysfs_del_meshif(soft_iface);
+ unregister_netdevice(soft_iface);
}
/**
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index fe9ca94ddee2..17c844196eb2 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -33,10 +33,10 @@
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/stringify.h>
+#include <linux/workqueue.h>
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
@@ -428,6 +428,13 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
int bytes_written;
+ /* GW mode is not available if the routing algorithm in use does not
+ * implement the GW API
+ */
+ if (!bat_priv->algo_ops->gw.get_best_gw_node ||
+ !bat_priv->algo_ops->gw.is_eligible)
+ return -ENOENT;
+
switch (atomic_read(&bat_priv->gw.mode)) {
case BATADV_GW_MODE_CLIENT:
bytes_written = sprintf(buff, "%s\n",
@@ -455,6 +462,13 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
char *curr_gw_mode_str;
int gw_mode_tmp = -1;
+ /* toggling GW mode is allowed only if the routing algorithm in use
+ * provides the GW API
+ */
+ if (!bat_priv->algo_ops->gw.get_best_gw_node ||
+ !bat_priv->algo_ops->gw.is_eligible)
+ return -EINVAL;
+
if (buff[count - 1] == '\n')
buff[count - 1] = '\0';
@@ -514,6 +528,50 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
return count;
}
+static ssize_t batadv_show_gw_sel_class(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+
+ /* GW selection class is not available if the routing algorithm in use
+ * does not implement the GW API
+ */
+ if (!bat_priv->algo_ops->gw.get_best_gw_node ||
+ !bat_priv->algo_ops->gw.is_eligible)
+ return -ENOENT;
+
+ if (bat_priv->algo_ops->gw.show_sel_class)
+ return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff);
+
+ return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class));
+}
+
+static ssize_t batadv_store_gw_sel_class(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+
+ /* setting the GW selection class is allowed only if the routing
+ * algorithm in use implements the GW API
+ */
+ if (!bat_priv->algo_ops->gw.get_best_gw_node ||
+ !bat_priv->algo_ops->gw.is_eligible)
+ return -EINVAL;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ if (bat_priv->algo_ops->gw.store_sel_class)
+ return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff,
+ count);
+
+ return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE,
+ batadv_post_gw_reselect, attr,
+ &bat_priv->gw.sel_class,
+ bat_priv->soft_iface);
+}
+
static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
struct attribute *attr, char *buff)
{
@@ -607,41 +665,36 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
return count;
}
-BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL);
-BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL);
+BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL);
#ifdef CONFIG_BATMAN_ADV_BLA
-BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR,
- batadv_bla_status_update);
+BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
-BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR,
- batadv_dat_status_update);
+BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update);
#endif
-BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu);
-static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL);
-static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode,
- batadv_store_gw_mode);
-BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR,
- 2 * BATADV_JITTER, INT_MAX, NULL);
-BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0,
- BATADV_TQ_MAX_VALUE, NULL);
-BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1,
- BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect);
-static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
+BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu);
+static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL);
+static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode);
+BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER,
+ INT_MAX, NULL);
+BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE,
+ NULL);
+static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class,
+ batadv_store_gw_sel_class);
+static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth,
batadv_store_gw_bwidth);
#ifdef CONFIG_BATMAN_ADV_MCAST
-BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_DEBUG
-BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0,
- BATADV_DBG_ALL, NULL);
+BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_NC
-BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR,
- batadv_nc_status_update);
+BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update);
#endif
-static BATADV_ATTR(isolation_mark, S_IRUGO | S_IWUSR,
- batadv_show_isolation_mark, batadv_store_isolation_mark);
+static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark,
+ batadv_store_isolation_mark);
static struct batadv_attribute *batadv_mesh_attrs[] = {
&batadv_attr_aggregated_ogms,
@@ -672,7 +725,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
NULL,
};
-BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL);
/* array of vlan specific sysfs attributes */
static struct batadv_attribute *batadv_vlan_attrs[] = {
@@ -712,6 +765,8 @@ rem_attr:
for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
+ kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
+ kobject_del(bat_priv->mesh_obj);
kobject_put(bat_priv->mesh_obj);
bat_priv->mesh_obj = NULL;
out:
@@ -726,6 +781,8 @@ void batadv_sysfs_del_meshif(struct net_device *dev)
for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
+ kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
+ kobject_del(bat_priv->mesh_obj);
kobject_put(bat_priv->mesh_obj);
bat_priv->mesh_obj = NULL;
}
@@ -781,6 +838,10 @@ rem_attr:
for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
+ if (vlan->kobj != bat_priv->mesh_obj) {
+ kobject_uevent(vlan->kobj, KOBJ_REMOVE);
+ kobject_del(vlan->kobj);
+ }
kobject_put(vlan->kobj);
vlan->kobj = NULL;
out:
@@ -800,6 +861,10 @@ void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
+ if (vlan->kobj != bat_priv->mesh_obj) {
+ kobject_uevent(vlan->kobj, KOBJ_REMOVE);
+ kobject_del(vlan->kobj);
+ }
kobject_put(vlan->kobj);
vlan->kobj = NULL;
}
@@ -828,31 +893,31 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
return length;
}
-static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
+/**
+ * batadv_store_mesh_iface_finish - store new hardif mesh_iface state
+ * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
+ * @ifname: name of soft-interface to modify
+ *
+ * Changes the parts of the hard+soft interface which can not be modified under
+ * sysfs lock (to prevent deadlock situations).
+ *
+ * Return: 0 on success, 0 < on failure
+ */
+static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
+ char ifname[IFNAMSIZ])
{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
struct net *net = dev_net(net_dev);
struct batadv_hard_iface *hard_iface;
- int status_tmp = -1;
- int ret = count;
+ int status_tmp;
+ int ret = 0;
+
+ ASSERT_RTNL();
hard_iface = batadv_hardif_get_by_netdev(net_dev);
if (!hard_iface)
- return count;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (strlen(buff) >= IFNAMSIZ) {
- pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
- buff);
- batadv_hardif_put(hard_iface);
- return -EINVAL;
- }
+ return 0;
- if (strncmp(buff, "none", 4) == 0)
+ if (strncmp(ifname, "none", 4) == 0)
status_tmp = BATADV_IF_NOT_IN_USE;
else
status_tmp = BATADV_IF_I_WANT_YOU;
@@ -861,15 +926,13 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
goto out;
if ((hard_iface->soft_iface) &&
- (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0))
+ (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0))
goto out;
- rtnl_lock();
-
if (status_tmp == BATADV_IF_NOT_IN_USE) {
batadv_hardif_disable_interface(hard_iface,
BATADV_IF_CLEANUP_AUTO);
- goto unlock;
+ goto out;
}
/* if the interface already is in use */
@@ -877,15 +940,71 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
batadv_hardif_disable_interface(hard_iface,
BATADV_IF_CLEANUP_AUTO);
- ret = batadv_hardif_enable_interface(hard_iface, net, buff);
-
-unlock:
- rtnl_unlock();
+ ret = batadv_hardif_enable_interface(hard_iface, net, ifname);
out:
batadv_hardif_put(hard_iface);
return ret;
}
+/**
+ * batadv_store_mesh_iface_work - store new hardif mesh_iface state
+ * @work: work queue item
+ *
+ * Changes the parts of the hard+soft interface which can not be modified under
+ * sysfs lock (to prevent deadlock situations).
+ */
+static void batadv_store_mesh_iface_work(struct work_struct *work)
+{
+ struct batadv_store_mesh_work *store_work;
+ int ret;
+
+ store_work = container_of(work, struct batadv_store_mesh_work, work);
+
+ rtnl_lock();
+ ret = batadv_store_mesh_iface_finish(store_work->net_dev,
+ store_work->soft_iface_name);
+ rtnl_unlock();
+
+ if (ret < 0)
+ pr_err("Failed to store new mesh_iface state %s for %s: %d\n",
+ store_work->soft_iface_name, store_work->net_dev->name,
+ ret);
+
+ dev_put(store_work->net_dev);
+ kfree(store_work);
+}
+
+static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_store_mesh_work *store_work;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ if (strlen(buff) >= IFNAMSIZ) {
+ pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
+ buff);
+ return -EINVAL;
+ }
+
+ store_work = kmalloc(sizeof(*store_work), GFP_KERNEL);
+ if (!store_work)
+ return -ENOMEM;
+
+ dev_hold(net_dev);
+ INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
+ store_work->net_dev = net_dev;
+ strlcpy(store_work->soft_iface_name, buff,
+ sizeof(store_work->soft_iface_name));
+
+ queue_work(batadv_event_workqueue, &store_work->work);
+
+ return count;
+}
+
static ssize_t batadv_show_iface_status(struct kobject *kobj,
struct attribute *attr, char *buff)
{
@@ -991,14 +1110,13 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj,
#endif
-static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
+static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface,
batadv_store_mesh_iface);
-static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
+static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL);
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR,
+BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644,
2 * BATADV_JITTER, INT_MAX, NULL);
-static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR,
- batadv_show_throughput_override,
+static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override,
batadv_store_throughput_override);
#endif
@@ -1048,6 +1166,8 @@ out:
void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
{
+ kobject_uevent(*hardif_obj, KOBJ_REMOVE);
+ kobject_del(*hardif_obj);
kobject_put(*hardif_obj);
*hardif_obj = NULL;
}
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 2333777f919d..981e8c5b07e9 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -615,9 +615,6 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
batadv_tp_fill_prerandom(tp_vars, data, data_len);
r = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (r == -1)
- kfree_skb(skb);
-
if (r == NET_XMIT_SUCCESS)
return 0;
@@ -837,6 +834,7 @@ static int batadv_tp_send(void *arg)
primary_if = batadv_primary_if_get_selected(bat_priv);
if (unlikely(!primary_if)) {
err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
goto out;
}
@@ -1206,9 +1204,6 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
/* send the ack */
r = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (r == -1)
- kfree_skb(skb);
-
if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
ret = BATADV_TP_REASON_DST_UNREACHABLE;
goto out;
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7e6df7a4964a..30ecbfb40adf 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -22,12 +22,14 @@
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/crc32c.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/if_ether.h>
+#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
@@ -35,25 +37,38 @@
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
-#include "multicast.h"
+#include "netlink.h"
#include "originator.h"
#include "packet.h"
#include "soft-interface.h"
#include "tvlv.h"
+static struct kmem_cache *batadv_tl_cache __read_mostly;
+static struct kmem_cache *batadv_tg_cache __read_mostly;
+static struct kmem_cache *batadv_tt_orig_cache __read_mostly;
+static struct kmem_cache *batadv_tt_change_cache __read_mostly;
+static struct kmem_cache *batadv_tt_req_cache __read_mostly;
+static struct kmem_cache *batadv_tt_roam_cache __read_mostly;
+
/* hash class keys */
static struct lock_class_key batadv_tt_local_hash_lock_class_key;
static struct lock_class_key batadv_tt_global_hash_lock_class_key;
@@ -205,6 +220,20 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
}
/**
+ * batadv_tt_local_entry_free_rcu - free the tt_local_entry
+ * @rcu: rcu pointer of the tt_local_entry
+ */
+static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+
+ tt_local_entry = container_of(rcu, struct batadv_tt_local_entry,
+ common.rcu);
+
+ kmem_cache_free(batadv_tl_cache, tt_local_entry);
+}
+
+/**
* batadv_tt_local_entry_release - release tt_local_entry from lists and queue
* for free after rcu grace period
* @ref: kref pointer of the nc_node
@@ -218,7 +247,7 @@ static void batadv_tt_local_entry_release(struct kref *ref)
batadv_softif_vlan_put(tt_local_entry->vlan);
- kfree_rcu(tt_local_entry, common.rcu);
+ call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu);
}
/**
@@ -234,6 +263,20 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
}
/**
+ * batadv_tt_global_entry_free_rcu - free the tt_global_entry
+ * @rcu: rcu pointer of the tt_global_entry
+ */
+static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+
+ tt_global_entry = container_of(rcu, struct batadv_tt_global_entry,
+ common.rcu);
+
+ kmem_cache_free(batadv_tg_cache, tt_global_entry);
+}
+
+/**
* batadv_tt_global_entry_release - release tt_global_entry from lists and queue
* for free after rcu grace period
* @ref: kref pointer of the nc_node
@@ -246,7 +289,8 @@ static void batadv_tt_global_entry_release(struct kref *ref)
common.refcount);
batadv_tt_global_del_orig_list(tt_global_entry);
- kfree_rcu(tt_global_entry, common.rcu);
+
+ call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu);
}
/**
@@ -384,6 +428,19 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
}
/**
+ * batadv_tt_orig_list_entry_free_rcu - free the orig_entry
+ * @rcu: rcu pointer of the orig_entry
+ */
+static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
+
+ kmem_cache_free(batadv_tt_orig_cache, orig_entry);
+}
+
+/**
* batadv_tt_orig_list_entry_release - release tt orig entry from lists and
* queue for free after rcu grace period
* @ref: kref pointer of the tt orig entry
@@ -396,7 +453,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
refcount);
batadv_orig_node_put(orig_entry->orig_node);
- kfree_rcu(orig_entry, rcu);
+ call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
}
/**
@@ -426,7 +483,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
bool event_removed = false;
bool del_op_requested, del_op_entry;
- tt_change_node = kmalloc(sizeof(*tt_change_node), GFP_ATOMIC);
+ tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC);
if (!tt_change_node)
return;
@@ -467,8 +524,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
continue;
del:
list_del(&entry->list);
- kfree(entry);
- kfree(tt_change_node);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ kmem_cache_free(batadv_tt_change_cache, tt_change_node);
event_removed = true;
goto unlock;
}
@@ -589,6 +646,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
struct net *net = dev_net(soft_iface);
struct batadv_softif_vlan *vlan;
struct net_device *in_dev = NULL;
+ struct batadv_hard_iface *in_hardif = NULL;
struct hlist_head *head;
struct batadv_tt_orig_list_entry *orig_entry;
int hash_added, table_size, packet_size_max;
@@ -600,6 +658,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (ifindex != BATADV_NULL_IFINDEX)
in_dev = dev_get_by_index(net, ifindex);
+ if (in_dev)
+ in_hardif = batadv_hardif_get_by_netdev(in_dev);
+
tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
if (!is_multicast_ether_addr(addr))
@@ -646,7 +707,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
goto out;
}
- tt_local = kmalloc(sizeof(*tt_local), GFP_ATOMIC);
+ tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC);
if (!tt_local)
goto out;
@@ -656,7 +717,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
net_ratelimited_function(batadv_info, soft_iface,
"adding TT local entry %pM to non-existent VLAN %d\n",
addr, BATADV_PRINT_VID(vid));
- kfree(tt_local);
+ kmem_cache_free(batadv_tl_cache, tt_local);
tt_local = NULL;
goto out;
}
@@ -673,10 +734,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
*/
tt_local->common.flags = BATADV_TT_CLIENT_NEW;
tt_local->common.vid = vid;
- if (batadv_is_wifi_netdev(in_dev))
+ if (batadv_is_wifi_hardif(in_hardif))
tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
kref_init(&tt_local->common.refcount);
- kref_get(&tt_local->common.refcount);
tt_local->last_seen = jiffies;
tt_local->common.added_at = tt_local->last_seen;
tt_local->vlan = vlan;
@@ -688,6 +748,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
is_multicast_ether_addr(addr))
tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE;
+ kref_get(&tt_local->common.refcount);
hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt,
batadv_choose_tt, &tt_local->common,
&tt_local->common.hash_entry);
@@ -733,7 +794,7 @@ check_roaming:
*/
remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK;
- if (batadv_is_wifi_netdev(in_dev))
+ if (batadv_is_wifi_hardif(in_hardif))
tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
else
tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI;
@@ -757,6 +818,8 @@ check_roaming:
ret = true;
out:
+ if (in_hardif)
+ batadv_hardif_put(in_hardif);
if (in_dev)
dev_put(in_dev);
if (tt_local)
@@ -959,7 +1022,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
tt_diff_entries_count++;
}
list_del(&entry->list);
- kfree(entry);
+ kmem_cache_free(batadv_tt_change_cache, entry);
}
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
@@ -989,6 +1052,7 @@ container_register:
kfree(tt_data);
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
{
struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1056,6 +1120,165 @@ out:
batadv_hardif_put(primary_if);
return 0;
}
+#endif
+
+/**
+ * batadv_tt_local_dump_entry - Dump one TT local entry into a message
+ * @msg :Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @common: tt local & tt global common data
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_tt_common_entry *common)
+{
+ void *hdr;
+ struct batadv_softif_vlan *vlan;
+ struct batadv_tt_local_entry *local;
+ unsigned int last_seen_msecs;
+ u32 crc;
+
+ local = container_of(common, struct batadv_tt_local_entry, common);
+ last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen);
+
+ vlan = batadv_softif_vlan_get(bat_priv, common->vid);
+ if (!vlan)
+ return 0;
+
+ crc = vlan->tt.crc;
+
+ batadv_softif_vlan_put(vlan);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI,
+ BATADV_CMD_GET_TRANSTABLE_LOCAL);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+ nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
+ goto nla_put_failure;
+
+ if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) &&
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @head: Pointer to the list containing the local tt entries
+ * @idx_s: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct hlist_head *head, int *idx_s)
+{
+ struct batadv_tt_common_entry *common;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(common, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv,
+ common)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_local_dump - Dump TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or 0 on success
+ */
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hashtable *hash;
+ struct hlist_head *head;
+ int ret;
+ int ifindex;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hash = bat_priv->tt.local_hash;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
+ bat_priv, head, &idx))
+ break;
+
+ bucket++;
+ }
+
+ ret = msg->len;
+
+ out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ return ret;
+}
static void
batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
@@ -1259,7 +1482,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
list) {
list_del(&entry->list);
- kfree(entry);
+ kmem_cache_free(batadv_tt_change_cache, entry);
}
atomic_set(&bat_priv->tt.local_changes, 0);
@@ -1341,7 +1564,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
goto out;
}
- orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC);
+ orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC);
if (!orig_entry)
goto out;
@@ -1351,9 +1574,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
orig_entry->orig_node = orig_node;
orig_entry->ttvn = ttvn;
kref_init(&orig_entry->refcount);
- kref_get(&orig_entry->refcount);
spin_lock_bh(&tt_global->list_lock);
+ kref_get(&orig_entry->refcount);
hlist_add_head_rcu(&orig_entry->list,
&tt_global->orig_list);
spin_unlock_bh(&tt_global->list_lock);
@@ -1411,7 +1634,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
goto out;
if (!tt_global_entry) {
- tt_global_entry = kzalloc(sizeof(*tt_global_entry), GFP_ATOMIC);
+ tt_global_entry = kmem_cache_zalloc(batadv_tg_cache,
+ GFP_ATOMIC);
if (!tt_global_entry)
goto out;
@@ -1428,13 +1652,13 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
if (flags & BATADV_TT_CLIENT_ROAM)
tt_global_entry->roam_at = jiffies;
kref_init(&common->refcount);
- kref_get(&common->refcount);
common->added_at = jiffies;
INIT_HLIST_HEAD(&tt_global_entry->orig_list);
atomic_set(&tt_global_entry->orig_list_count, 0);
spin_lock_init(&tt_global_entry->list_lock);
+ kref_get(&common->refcount);
hash_added = batadv_hash_add(bat_priv->tt.global_hash,
batadv_compare_tt,
batadv_choose_tt, common,
@@ -1579,6 +1803,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
return best_entry;
}
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* batadv_tt_global_print_entry - print all orig nodes who announce the address
* for this global entry
@@ -1702,6 +1927,219 @@ out:
batadv_hardif_put(primary_if);
return 0;
}
+#endif
+
+/**
+ * batadv_tt_global_dump_subentry - Dump all TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @common: tt local & tt global common data
+ * @orig: Originator node announcing a non-mesh client
+ * @best: Is the best originator for the TT entry
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_tt_common_entry *common,
+ struct batadv_tt_orig_list_entry *orig,
+ bool best)
+{
+ void *hdr;
+ struct batadv_orig_node_vlan *vlan;
+ u8 last_ttvn;
+ u32 crc;
+
+ vlan = batadv_orig_node_vlan_get(orig->orig_node,
+ common->vid);
+ if (!vlan)
+ return 0;
+
+ crc = vlan->tt.crc;
+
+ batadv_orig_node_vlan_put(vlan);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI,
+ BATADV_CMD_GET_TRANSTABLE_GLOBAL);
+ if (!hdr)
+ return -ENOBUFS;
+
+ last_ttvn = atomic_read(&orig->orig_node->last_ttvn);
+
+ if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+ nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ orig->orig_node->orig) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+ nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_global_dump_entry - Dump one TT global entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @common: tt local & tt global common data
+ * @sub_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_tt_common_entry *common, int *sub_s)
+{
+ struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
+ struct batadv_tt_global_entry *global;
+ struct hlist_head *head;
+ int sub = 0;
+ bool best;
+
+ global = container_of(common, struct batadv_tt_global_entry, common);
+ best_entry = batadv_transtable_best_orig(bat_priv, global);
+ head = &global->orig_list;
+
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (orig_entry == best_entry);
+
+ if (batadv_tt_global_dump_subentry(msg, portid, seq, common,
+ orig_entry, best)) {
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_global_dump_bucket - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @head: Pointer to the list containing the global tt entries
+ * @idx_s: Number of entries to skip
+ * @sub: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_tt_common_entry *common;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(common, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv,
+ common, sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_global_dump - Dump TT global entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or length of message on success
+ */
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hashtable *hash;
+ struct hlist_head *head;
+ int ret;
+ int ifindex;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return -EINVAL;
+
+ soft_iface = dev_get_by_index(net, ifindex);
+ if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bat_priv = netdev_priv(soft_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hash = bat_priv->tt.global_hash;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_tt_global_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq, bat_priv,
+ head, &idx, &sub))
+ break;
+
+ bucket++;
+ }
+
+ ret = msg->len;
+
+ out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+ if (soft_iface)
+ dev_put(soft_iface);
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+
+ return ret;
+}
/**
* _batadv_tt_global_del_orig_entry - remove and free an orig_entry
@@ -2280,7 +2718,7 @@ static void batadv_tt_req_node_release(struct kref *ref)
tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
- kfree(tt_req_node);
+ kmem_cache_free(batadv_tt_req_cache, tt_req_node);
}
/**
@@ -2367,7 +2805,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv,
goto unlock;
}
- tt_req_node = kmalloc(sizeof(*tt_req_node), GFP_ATOMIC);
+ tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC);
if (!tt_req_node)
goto unlock;
@@ -2849,7 +3287,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
&tvlv_tt_data,
&tt_change,
&tt_len);
- if (!tt_len)
+ if (!tt_len || !tvlv_len)
goto unlock;
/* Copy the last orig_node's OGM buffer */
@@ -2867,7 +3305,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
&tvlv_tt_data,
&tt_change,
&tt_len);
- if (!tt_len)
+ if (!tt_len || !tvlv_len)
goto out;
/* fill the rest of the tvlv with the real TT entries */
@@ -3104,7 +3542,7 @@ static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
list_del(&node->list);
- kfree(node);
+ kmem_cache_free(batadv_tt_roam_cache, node);
}
spin_unlock_bh(&bat_priv->tt.roam_list_lock);
@@ -3121,7 +3559,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
continue;
list_del(&node->list);
- kfree(node);
+ kmem_cache_free(batadv_tt_roam_cache, node);
}
spin_unlock_bh(&bat_priv->tt.roam_list_lock);
}
@@ -3162,7 +3600,8 @@ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
}
if (!ret) {
- tt_roam_node = kmalloc(sizeof(*tt_roam_node), GFP_ATOMIC);
+ tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache,
+ GFP_ATOMIC);
if (!tt_roam_node)
goto unlock;
@@ -3361,9 +3800,6 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
{
lockdep_assert_held(&bat_priv->tt.commit_lock);
- /* Update multicast addresses in local translation table */
- batadv_mcast_mla_update(bat_priv);
-
if (atomic_read(&bat_priv->tt.local_changes) < 1) {
if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
batadv_tt_tvlv_container_update(bat_priv);
@@ -3401,8 +3837,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
unsigned short vid)
{
- struct batadv_tt_local_entry *tt_local_entry = NULL;
- struct batadv_tt_global_entry *tt_global_entry = NULL;
+ struct batadv_tt_local_entry *tt_local_entry;
+ struct batadv_tt_global_entry *tt_global_entry;
struct batadv_softif_vlan *vlan;
bool ret = false;
@@ -3411,27 +3847,24 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
return false;
if (!atomic_read(&vlan->ap_isolation))
- goto out;
+ goto vlan_put;
tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
if (!tt_local_entry)
- goto out;
+ goto vlan_put;
tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid);
if (!tt_global_entry)
- goto out;
+ goto local_entry_put;
- if (!_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
- goto out;
-
- ret = true;
+ if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ ret = true;
-out:
+ batadv_tt_global_entry_put(tt_global_entry);
+local_entry_put:
+ batadv_tt_local_entry_put(tt_local_entry);
+vlan_put:
batadv_softif_vlan_put(vlan);
- if (tt_global_entry)
- batadv_tt_global_entry_put(tt_global_entry);
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
@@ -3865,3 +4298,85 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
return ret;
}
+
+/**
+ * batadv_tt_cache_init - Initialize tt memory object cache
+ *
+ * Return: 0 on success or negative error number in case of failure.
+ */
+int __init batadv_tt_cache_init(void)
+{
+ size_t tl_size = sizeof(struct batadv_tt_local_entry);
+ size_t tg_size = sizeof(struct batadv_tt_global_entry);
+ size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry);
+ size_t tt_change_size = sizeof(struct batadv_tt_change_node);
+ size_t tt_req_size = sizeof(struct batadv_tt_req_node);
+ size_t tt_roam_size = sizeof(struct batadv_tt_roam_node);
+
+ batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tl_cache)
+ return -ENOMEM;
+
+ batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tg_cache)
+ goto err_tt_tl_destroy;
+
+ batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache",
+ tt_orig_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_orig_cache)
+ goto err_tt_tg_destroy;
+
+ batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache",
+ tt_change_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_change_cache)
+ goto err_tt_orig_destroy;
+
+ batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache",
+ tt_req_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_req_cache)
+ goto err_tt_change_destroy;
+
+ batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache",
+ tt_roam_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_roam_cache)
+ goto err_tt_req_destroy;
+
+ return 0;
+
+err_tt_req_destroy:
+ kmem_cache_destroy(batadv_tt_req_cache);
+ batadv_tt_req_cache = NULL;
+err_tt_change_destroy:
+ kmem_cache_destroy(batadv_tt_change_cache);
+ batadv_tt_change_cache = NULL;
+err_tt_orig_destroy:
+ kmem_cache_destroy(batadv_tt_orig_cache);
+ batadv_tt_orig_cache = NULL;
+err_tt_tg_destroy:
+ kmem_cache_destroy(batadv_tg_cache);
+ batadv_tg_cache = NULL;
+err_tt_tl_destroy:
+ kmem_cache_destroy(batadv_tl_cache);
+ batadv_tl_cache = NULL;
+
+ return -ENOMEM;
+}
+
+/**
+ * batadv_tt_cache_destroy - Destroy tt memory object cache
+ */
+void batadv_tt_cache_destroy(void)
+{
+ kmem_cache_destroy(batadv_tl_cache);
+ kmem_cache_destroy(batadv_tg_cache);
+ kmem_cache_destroy(batadv_tt_orig_cache);
+ kmem_cache_destroy(batadv_tt_change_cache);
+ kmem_cache_destroy(batadv_tt_req_cache);
+ kmem_cache_destroy(batadv_tt_roam_cache);
+}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 7c7e2c006bfe..783fdba84db2 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -22,8 +22,10 @@
#include <linux/types.h>
+struct netlink_callback;
struct net_device;
struct seq_file;
+struct sk_buff;
int batadv_tt_init(struct batadv_priv *bat_priv);
bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
@@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
const char *message, bool roaming);
int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
s32 match_vid, const char *message);
@@ -59,4 +63,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
const u8 *addr, unsigned short vid);
+int batadv_tt_cache_init(void);
+void batadv_tt_cache_destroy(void);
+
#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 3d1cf0fb112d..a783420356ae 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -257,8 +257,13 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
spin_lock_bh(&bat_priv->tvlv.container_list_lock);
tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
batadv_tvlv_container_remove(bat_priv, tvlv_old);
+
+ kref_get(&tvlv_new->refcount);
hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+
+ /* don't return reference to new tvlv_container */
+ batadv_tvlv_container_put(tvlv_new);
}
/**
@@ -542,8 +547,12 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
INIT_HLIST_NODE(&tvlv_handler->list);
spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ kref_get(&tvlv_handler->refcount);
hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+
+ /* don't return reference to new tvlv_handler */
+ batadv_tvlv_handler_put(tvlv_handler);
}
/**
@@ -591,7 +600,6 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
unsigned char *tvlv_buff;
unsigned int tvlv_len;
ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
- int res;
orig_node = batadv_orig_hash_find(bat_priv, dst);
if (!orig_node)
@@ -624,9 +632,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
tvlv_buff += sizeof(*tvlv_hdr);
memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
- res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res == -1)
- kfree_skb(skb);
+ batadv_send_skb_to_orig(skb, orig_node, NULL);
out:
batadv_orig_node_put(orig_node);
}
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index a64522c3b45d..e913aee28c98 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -28,6 +28,7 @@
#include <linux/if_ether.h>
#include <linux/kref.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/sched.h> /* for linux/wait.h */
#include <linux/spinlock.h>
#include <linux/types.h>
@@ -118,12 +119,28 @@ struct batadv_hard_iface_bat_v {
};
/**
+ * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration
+ * of a batadv_hard_iface
+ * @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device
+ * @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device
+ * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
+ * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi device
+ */
+enum batadv_hard_iface_wifi_flags {
+ BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0),
+ BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1),
+ BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2),
+ BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3),
+};
+
+/**
* struct batadv_hard_iface - network device known to batman-adv
* @list: list node for batadv_hardif_list
* @if_num: identificator of the interface
* @if_status: status of the interface for batman-adv
- * @net_dev: pointer to the net_device
* @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+ * @wifi_flags: flags whether this is (directly or indirectly) a wifi interface
+ * @net_dev: pointer to the net_device
* @hardif_obj: kobject of the per interface sysfs "mesh" directory
* @refcount: number of contexts the object is used
* @batman_adv_ptype: packet type describing packets that should be processed by
@@ -132,7 +149,6 @@ struct batadv_hard_iface_bat_v {
* @rcu: struct used for freeing in an RCU-safe manner
* @bat_iv: per hard-interface B.A.T.M.A.N. IV data
* @bat_v: per hard-interface B.A.T.M.A.N. V data
- * @cleanup_work: work queue callback item for hard-interface deinit
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
* @neigh_list: list of unique single hop neighbors via this interface
* @neigh_list_lock: lock protecting neigh_list
@@ -141,8 +157,9 @@ struct batadv_hard_iface {
struct list_head list;
s16 if_num;
char if_status;
- struct net_device *net_dev;
u8 num_bcasts;
+ u32 wifi_flags;
+ struct net_device *net_dev;
struct kobject *hardif_obj;
struct kref refcount;
struct packet_type batman_adv_ptype;
@@ -152,7 +169,6 @@ struct batadv_hard_iface {
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
struct batadv_hard_iface_bat_v bat_v;
#endif
- struct work_struct cleanup_work;
struct dentry *debug_dir;
struct hlist_head neigh_list;
/* neigh_list_lock protects: neigh_list */
@@ -185,7 +201,7 @@ struct batadv_orig_ifinfo {
/**
* struct batadv_frag_table_entry - head in the fragment buffer table
- * @head: head of list with fragments
+ * @fragment_list: head of list with fragments
* @lock: lock to protect the list of fragments
* @timestamp: time (jiffie) of last received fragment
* @seqno: sequence number of the fragments in the list
@@ -193,8 +209,8 @@ struct batadv_orig_ifinfo {
* @total_size: expected size of the assembled packet
*/
struct batadv_frag_table_entry {
- struct hlist_head head;
- spinlock_t lock; /* protects head */
+ struct hlist_head fragment_list;
+ spinlock_t lock; /* protects fragment_list */
unsigned long timestamp;
u16 seqno;
u16 size;
@@ -409,6 +425,7 @@ struct batadv_hardif_neigh_node_bat_v {
* struct batadv_hardif_neigh_node - unique neighbor per hard-interface
* @list: list node for batadv_hard_iface::neigh_list
* @addr: the MAC address of the neighboring interface
+ * @orig: the address of the originator this neighbor node belongs to
* @if_incoming: pointer to incoming hard-interface
* @last_seen: when last packet via this neighbor was received
* @bat_v: B.A.T.M.A.N. V private data
@@ -418,6 +435,7 @@ struct batadv_hardif_neigh_node_bat_v {
struct batadv_hardif_neigh_node {
struct hlist_node list;
u8 addr[ETH_ALEN];
+ u8 orig[ETH_ALEN];
struct batadv_hard_iface *if_incoming;
unsigned long last_seen;
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
@@ -707,8 +725,8 @@ struct batadv_priv_debug_log {
/**
* struct batadv_priv_gw - per mesh interface gateway data
- * @list: list of available gateway nodes
- * @list_lock: lock protecting gw_list & curr_gw
+ * @gateway_list: list of available gateway nodes
+ * @list_lock: lock protecting gateway_list & curr_gw
* @curr_gw: pointer to currently selected gateway node
* @mode: gateway operation: off, client or server (see batadv_gw_modes)
* @sel_class: gateway selection class (applies if gw_mode client)
@@ -717,8 +735,8 @@ struct batadv_priv_debug_log {
* @reselect: bool indicating a gateway re-selection is in progress
*/
struct batadv_priv_gw {
- struct hlist_head list;
- spinlock_t list_lock; /* protects gw_list & curr_gw */
+ struct hlist_head gateway_list;
+ spinlock_t list_lock; /* protects gateway_list & curr_gw */
struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */
atomic_t mode;
atomic_t sel_class;
@@ -786,9 +804,10 @@ struct batadv_mcast_querier_state {
* @num_want_all_ipv6: counter for items in want_all_ipv6_list
* @want_lists_lock: lock for protecting modifications to mcast want lists
* (traversals are rcu-locked)
+ * @work: work queue callback item for multicast TT and TVLV updates
*/
struct batadv_priv_mcast {
- struct hlist_head mla_list;
+ struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */
struct hlist_head want_all_unsnoopables_list;
struct hlist_head want_all_ipv4_list;
struct hlist_head want_all_ipv6_list;
@@ -803,6 +822,7 @@ struct batadv_priv_mcast {
atomic_t num_want_all_ipv6;
/* protects want_all_{unsnoopables,ipv4,ipv6}_list */
spinlock_t want_lists_lock;
+ struct delayed_work work;
};
#endif
@@ -1015,7 +1035,6 @@ struct batadv_priv_bat_v {
* @forw_bcast_list_lock: lock protecting forw_bcast_list
* @tp_list_lock: spinlock protecting @tp_list
* @orig_work: work queue callback item for orig node purging
- * @cleanup_work: work queue callback item for soft-interface deinit
* @primary_if: one of the hard-interfaces assigned to this mesh interface
* becomes the primary interface
* @algo_ops: routing algorithm used by this mesh interface
@@ -1074,7 +1093,6 @@ struct batadv_priv {
spinlock_t tp_list_lock; /* protects tp_list */
atomic_t tp_num;
struct delayed_work orig_work;
- struct work_struct cleanup_work;
struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */
struct batadv_algo_ops *algo_ops;
struct hlist_head softif_vlan_list;
@@ -1366,7 +1384,8 @@ struct batadv_skb_cb {
/**
* struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
- * @list: list node for batadv_socket_client::queue_list
+ * @list: list node for batadv_priv::forw_{bat,bcast}_list
+ * @cleanup_list: list node for purging functions
* @send_time: execution time for delayed_work (packet sending)
* @own: bool for locally generated packets (local OGMs are re-scheduled after
* sending)
@@ -1379,9 +1398,11 @@ struct batadv_skb_cb {
* locally generated packet
* @if_outgoing: packet where the packet should be sent to, or NULL if
* unspecified
+ * @queue_left: The queue (counter) this packet was applied to
*/
struct batadv_forw_packet {
struct hlist_node list;
+ struct hlist_node cleanup_list;
unsigned long send_time;
u8 own;
struct sk_buff *skb;
@@ -1391,11 +1412,13 @@ struct batadv_forw_packet {
struct delayed_work delayed_work;
struct batadv_hard_iface *if_incoming;
struct batadv_hard_iface *if_outgoing;
+ atomic_t *queue_left;
};
/**
* struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific)
* @activate: start routing mechanisms when hard-interface is brought up
+ * (optional)
* @enable: init routing info when hard-interface is enabled
* @disable: de-init routing info when hard-interface is disabled
* @update_mac: (re-)init mac addresses of the protocol information
@@ -1413,11 +1436,13 @@ struct batadv_algo_iface_ops {
/**
* struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific)
* @hardif_init: called on creation of single hop entry
+ * (optional)
* @cmp: compare the metrics of two neighbors for their respective outgoing
* interfaces
* @is_similar_or_better: check if neigh1 is equally similar or better than
* neigh2 for their respective outgoing interface from the metric prospective
* @print: print the single hop neighbor list (optional)
+ * @dump: dump neighbors to a netlink socket (optional)
*/
struct batadv_algo_neigh_ops {
void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
@@ -1429,26 +1454,64 @@ struct batadv_algo_neigh_ops {
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
void (*print)(struct batadv_priv *priv, struct seq_file *seq);
+#endif
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv,
+ struct batadv_hard_iface *hard_iface);
};
/**
* struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
* @free: free the resources allocated by the routing algorithm for an orig_node
- * object
+ * object (optional)
* @add_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to a new hard-interface being added into the mesh
+ * orig_node due to a new hard-interface being added into the mesh (optional)
* @del_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to an hard-interface being removed from the mesh
+ * orig_node due to an hard-interface being removed from the mesh (optional)
* @print: print the originator table (optional)
+ * @dump: dump originators to a netlink socket (optional)
*/
struct batadv_algo_orig_ops {
void (*free)(struct batadv_orig_node *orig_node);
int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num);
int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num,
int del_if_num);
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
void (*print)(struct batadv_priv *priv, struct seq_file *seq,
struct batadv_hard_iface *hard_iface);
+#endif
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv,
+ struct batadv_hard_iface *hard_iface);
+};
+
+/**
+ * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
+ * @store_sel_class: parse and stores a new GW selection class (optional)
+ * @show_sel_class: prints the current GW selection class (optional)
+ * @get_best_gw_node: select the best GW from the list of available nodes
+ * (optional)
+ * @is_eligible: check if a newly discovered GW is a potential candidate for
+ * the election as best GW (optional)
+ * @print: print the gateway table (optional)
+ * @dump: dump gateways to a netlink socket (optional)
+ */
+struct batadv_algo_gw_ops {
+ ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
+ size_t count);
+ ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
+ struct batadv_gw_node *(*get_best_gw_node)
+ (struct batadv_priv *bat_priv);
+ bool (*is_eligible)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node);
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
+ void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
+#endif
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv);
};
/**
@@ -1458,6 +1521,7 @@ struct batadv_algo_orig_ops {
* @iface: callbacks related to interface handling
* @neigh: callbacks related to neighbors handling
* @orig: callbacks related to originators handling
+ * @gw: callbacks related to GW mode
*/
struct batadv_algo_ops {
struct hlist_node list;
@@ -1465,6 +1529,7 @@ struct batadv_algo_ops {
struct batadv_algo_iface_ops iface;
struct batadv_algo_neigh_ops neigh;
struct batadv_algo_orig_ops orig;
+ struct batadv_algo_gw_ops gw;
};
/**
@@ -1564,4 +1629,17 @@ enum batadv_tvlv_handler_flags {
BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
};
+/**
+ * struct batadv_store_mesh_work - Work queue item to detach add/del interface
+ * from sysfs locks
+ * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
+ * @soft_iface_name: name of soft-interface to modify
+ * @work: work queue item
+ */
+struct batadv_store_mesh_work {
+ struct net_device *net_dev;
+ char soft_iface_name[IFNAMSIZ];
+ struct work_struct work;
+};
+
#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index d020299baba4..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
BT_DBG("dev %p removing %speer %p", dev,
last ? "last " : "1 ", peer);
BT_DBG("chan %p orig refcnt %d", chan,
- atomic_read(&chan->kref.refcount));
+ kref_read(&chan->kref));
l2cap_chan_put(chan);
break;
@@ -1090,7 +1090,6 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
{
struct hci_conn *hcon;
struct hci_dev *hdev;
- bdaddr_t *src = BDADDR_ANY;
int n;
n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
@@ -1101,7 +1100,8 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
if (n < 7)
return -EINVAL;
- hdev = hci_get_route(addr, src);
+ /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
+ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
if (!hdev)
return -ENOENT;
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index b3ff12eb9b6d..4bfaa19a5573 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -20,5 +20,3 @@ bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
bluetooth-$(CONFIG_BT_LEDS) += leds.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
-
-subdir-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
/* AMP Manager functions */
struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
kref_get(&mgr->kref);
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
int amp_mgr_put(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
return kref_put(&mgr->kref, &amp_mgr_destroy);
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 0b5f729d08d2..1aff2da9bc74 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -26,11 +26,13 @@
#include <linux/module.h>
#include <linux/debugfs.h>
+#include <linux/stringify.h>
#include <asm/ioctls.h>
#include <net/bluetooth/bluetooth.h>
#include <linux/proc_fs.h>
+#include "leds.h"
#include "selftest.h"
/* Bluetooth sockets */
@@ -712,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = {
struct dentry *bt_debugfs;
EXPORT_SYMBOL_GPL(bt_debugfs);
+#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
+ __stringify(BT_SUBSYS_REVISION)
+
static int __init bt_init(void)
{
int err;
sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
- BT_INFO("Core ver %s", BT_SUBSYS_VERSION);
+ BT_INFO("Core ver %s", VERSION);
err = bt_selftest();
if (err < 0)
@@ -726,6 +731,8 @@ static int __init bt_init(void)
bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+ bt_leds_init();
+
err = bt_sysfs_init();
if (err < 0)
return err;
@@ -785,6 +792,8 @@ static void __exit bt_exit(void)
bt_sysfs_cleanup();
+ bt_leds_cleanup();
+
debugfs_remove_recursive(bt_debugfs);
}
@@ -792,7 +801,7 @@ subsys_initcall(bt_init);
module_exit(bt_exit);
MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION);
-MODULE_VERSION(BT_SUBSYS_VERSION);
+MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
+MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
void amp_ctrl_get(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));
kref_get(&ctrl->kref);
}
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
int amp_ctrl_put(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));
return kref_put(&ctrl->kref, &amp_ctrl_destroy);
}
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index f4fcb4a9d5c1..2b875edf77e1 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -211,7 +211,6 @@ static const struct net_device_ops bnep_netdev_ops = {
.ndo_set_rx_mode = bnep_net_set_mc_list,
.ndo_set_mac_address = bnep_net_set_mac_addr,
.ndo_tx_timeout = bnep_net_timeout,
- .ndo_change_mtu = eth_change_mtu,
};
@@ -222,6 +221,8 @@ void bnep_net_setup(struct net_device *dev)
dev->addr_len = ETH_ALEN;
ether_setup(dev);
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->netdev_ops = &bnep_netdev_ops;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 3809617aa98d..dc59eae54717 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -613,7 +613,7 @@ int hci_conn_del(struct hci_conn *conn)
return 0;
}
-struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
+struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
{
int use_src = bacmp(src, BDADDR_ANY);
struct hci_dev *hdev = NULL, *d;
@@ -634,7 +634,29 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
*/
if (use_src) {
- if (!bacmp(&d->bdaddr, src)) {
+ bdaddr_t id_addr;
+ u8 id_addr_type;
+
+ if (src_type == BDADDR_BREDR) {
+ if (!lmp_bredr_capable(d))
+ continue;
+ bacpy(&id_addr, &d->bdaddr);
+ id_addr_type = BDADDR_BREDR;
+ } else {
+ if (!lmp_le_capable(d))
+ continue;
+
+ hci_copy_identity_address(d, &id_addr,
+ &id_addr_type);
+
+ /* Convert from HCI to three-value type */
+ if (id_addr_type == ADDR_LE_DEV_PUBLIC)
+ id_addr_type = BDADDR_LE_PUBLIC;
+ else
+ id_addr_type = BDADDR_LE_RANDOM;
+ }
+
+ if (!bacmp(&id_addr, src) && id_addr_type == src_type) {
hdev = d; break;
}
} else {
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ddf8432fe8fb..3ac89e9ace71 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
hci_dev_test_flag(hdev, HCI_MGMT))
__mgmt_power_off(hdev);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index b0e23dfc5c34..1015d9c8d97d 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,8 +21,6 @@
SOFTWARE IS DISCLAIMED.
*/
-#include <asm/unaligned.h>
-
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/mgmt.h>
@@ -971,48 +969,85 @@ void __hci_req_enable_advertising(struct hci_request *req)
hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
}
-static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
{
- u8 ad_len = 0;
- size_t name_len;
+ size_t short_len;
+ size_t complete_len;
- name_len = strlen(hdev->dev_name);
- if (name_len > 0) {
- size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
+ /* no space left for name (+ NULL + type + len) */
+ if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3)
+ return ad_len;
- if (name_len > max_len) {
- name_len = max_len;
- ptr[1] = EIR_NAME_SHORT;
- } else
- ptr[1] = EIR_NAME_COMPLETE;
+ /* use complete name if present and fits */
+ complete_len = strlen(hdev->dev_name);
+ if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH)
+ return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, complete_len + 1);
- ptr[0] = name_len + 1;
+ /* use short name if present */
+ short_len = strlen(hdev->short_name);
+ if (short_len)
+ return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
+ hdev->short_name, short_len + 1);
- memcpy(ptr + 2, hdev->dev_name, name_len);
+ /* use shortened full name if present, we already know that name
+ * is longer then HCI_MAX_SHORT_NAME_LENGTH
+ */
+ if (complete_len) {
+ u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1];
- ad_len += (name_len + 2);
- ptr += (name_len + 2);
+ memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH);
+ name[HCI_MAX_SHORT_NAME_LENGTH] = '\0';
+
+ return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name,
+ sizeof(name));
}
return ad_len;
}
+static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
+{
+ return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance);
+}
+
+static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ u8 scan_rsp_len = 0;
+
+ if (hdev->appearance) {
+ scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
+ }
+
+ return append_local_name(hdev, ptr, scan_rsp_len);
+}
+
static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
u8 *ptr)
{
struct adv_info *adv_instance;
+ u32 instance_flags;
+ u8 scan_rsp_len = 0;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
return 0;
- /* TODO: Set the appropriate entries based on advertising instance flags
- * here once flags other than 0 are supported.
- */
- memcpy(ptr, adv_instance->scan_rsp_data,
+ instance_flags = adv_instance->flags;
+
+ if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) {
+ scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
+ }
+
+ memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data,
adv_instance->scan_rsp_len);
- return adv_instance->scan_rsp_len;
+ scan_rsp_len += adv_instance->scan_rsp_len;
+
+ if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len);
+
+ return scan_rsp_len;
}
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
@@ -1194,7 +1229,7 @@ static void adv_timeout_expire(struct work_struct *work)
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, instance, false);
+ hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -1284,8 +1319,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
* setting.
* - force == false: Only instances that have a timeout will be removed.
*/
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force)
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force)
{
struct adv_info *adv_instance, *n, *next_instance = NULL;
int err;
@@ -1311,7 +1347,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
rem_inst = adv_instance->instance;
err = hci_remove_adv_instance(hdev, rem_inst);
if (!err)
- mgmt_advertising_removed(NULL, hdev, rem_inst);
+ mgmt_advertising_removed(sk, hdev, rem_inst);
}
} else {
adv_instance = hci_find_adv_instance(hdev, instance);
@@ -1325,7 +1361,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
err = hci_remove_adv_instance(hdev, instance);
if (!err)
- mgmt_advertising_removed(NULL, hdev, instance);
+ mgmt_advertising_removed(sk, hdev, instance);
}
}
@@ -1716,7 +1752,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
* function. To be safe hard-code one of the
* values that's suitable for SCO.
*/
- rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES;
+ rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
sizeof(rej), &rej);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index b2d044bdc732..dde77bd59f91 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -20,6 +20,8 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <asm/unaligned.h>
+
#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock)
#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock)
@@ -73,8 +75,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
bool force);
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force);
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force);
void __hci_req_update_class(struct hci_request *req);
@@ -102,3 +105,26 @@ static inline void hci_update_background_scan(struct hci_dev *hdev)
void hci_request_setup(struct hci_dev *hdev);
void hci_request_cancel_all(struct hci_dev *hdev);
+
+u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len);
+
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
+ u8 *data, u8 data_len)
+{
+ eir[eir_len++] = sizeof(type) + data_len;
+ eir[eir_len++] = type;
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
+{
+ eir[eir_len++] = sizeof(type) + sizeof(data);
+ eir[eir_len++] = type;
+ put_unaligned_le16(data, &eir[eir_len]);
+ eir_len += sizeof(data);
+
+ return eir_len;
+}
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 96f04b7b9556..48f9471e7c85 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/utsname.h>
+#include <linux/sched.h>
#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
@@ -38,6 +39,8 @@
static LIST_HEAD(mgmt_chan_list);
static DEFINE_MUTEX(mgmt_chan_list_lock);
+static DEFINE_IDA(sock_cookie_ida);
+
static atomic_t monitor_promisc = ATOMIC_INIT(0);
/* ----- HCI socket interface ----- */
@@ -52,6 +55,8 @@ struct hci_pinfo {
__u32 cmsg_mask;
unsigned short channel;
unsigned long flags;
+ __u32 cookie;
+ char comm[TASK_COMM_LEN];
};
void hci_sock_set_flag(struct sock *sk, int nr)
@@ -74,6 +79,38 @@ unsigned short hci_sock_get_channel(struct sock *sk)
return hci_pi(sk)->channel;
}
+u32 hci_sock_get_cookie(struct sock *sk)
+{
+ return hci_pi(sk)->cookie;
+}
+
+static bool hci_sock_gen_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (!id) {
+ id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
+ if (id < 0)
+ id = 0xffffffff;
+
+ hci_pi(sk)->cookie = id;
+ get_task_comm(hci_pi(sk)->comm, current);
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_sock_free_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (id) {
+ hci_pi(sk)->cookie = 0xffffffff;
+ ida_simple_remove(&sock_cookie_ida, id);
+ }
+}
+
static inline int hci_test_bit(int nr, const void *addr)
{
return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
@@ -305,6 +342,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(skb_copy);
}
+void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
+ void *data, u16 data_len, ktime_t tstamp,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+ __le16 index;
+
+ if (hdev)
+ index = cpu_to_le16(hdev->id);
+ else
+ index = cpu_to_le16(MGMT_INDEX_NONE);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL)
+ continue;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC);
+ if (!skb)
+ continue;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(event, skb_put(skb, 2));
+
+ if (data)
+ memcpy(skb_put(skb, data_len), data, data_len);
+
+ skb->tstamp = tstamp;
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
{
struct hci_mon_hdr *hdr;
@@ -384,6 +475,129 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
return skb;
}
+static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ u16 format;
+ u8 ver[3];
+ u32 flags;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ format = 0x0000;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_USER:
+ format = 0x0001;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_CONTROL:
+ format = 0x0002;
+ mgmt_fill_version_info(ver);
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(format, skb_put(skb, 2));
+ memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver));
+ put_unaligned_le32(flags, skb_put(skb, 4));
+ *skb_put(skb, 1) = TASK_COMM_LEN;
+ memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(4, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
+ u16 opcode, u16 len,
+ const void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
static void __printf(2, 3)
send_monitor_note(struct sock *sk, const char *fmt, ...)
{
@@ -458,6 +672,26 @@ static void send_monitor_replay(struct sock *sk)
read_unlock(&hci_dev_list_lock);
}
+static void send_monitor_control_replay(struct sock *mon_sk)
+{
+ struct sock *sk;
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_ctrl_open(sk);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(mon_sk, skb))
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
/* Generate internal stack event */
static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
{
@@ -585,6 +819,7 @@ static int hci_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct hci_dev *hdev;
+ struct sk_buff *skb;
BT_DBG("sock %p sk %p", sock, sk);
@@ -593,8 +828,24 @@ static int hci_sock_release(struct socket *sock)
hdev = hci_pi(sk)->hdev;
- if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR)
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_MONITOR:
atomic_dec(&monitor_promisc);
+ break;
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ hci_sock_free_cookie(sk);
+ break;
+ }
bt_sock_unlink(&hci_sk_list, sk);
@@ -721,6 +972,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
goto done;
}
+ /* When calling an ioctl on an unbound raw socket, then ensure
+ * that the monitor gets informed. Ensure that the resulting event
+ * is only send once by checking if the cookie exists or not. The
+ * socket cookie will be only ever generated once for the lifetime
+ * of a given socket.
+ */
+ if (hci_sock_gen_cookie(sk)) {
+ struct sk_buff *skb;
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
release_sock(sk);
switch (cmd) {
@@ -784,6 +1056,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
struct sockaddr_hci haddr;
struct sock *sk = sock->sk;
struct hci_dev *hdev = NULL;
+ struct sk_buff *skb;
int len, err = 0;
BT_DBG("sock %p sk %p", sock, sk);
@@ -822,7 +1095,35 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
atomic_inc(&hdev->promisc);
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * then there has been already an ioctl issued against
+ * an unbound socket and with that triggerd an open
+ * notification. Send a close notification first to
+ * allow the state transition to bounded.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
break;
case HCI_CHANNEL_USER:
@@ -884,9 +1185,38 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
}
}
- atomic_inc(&hdev->promisc);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * this socket will transition from a raw socket into
+ * an user channel socket. For a clean transition, send
+ * the close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* The user channel is restricted to CAP_NET_ADMIN
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ atomic_inc(&hdev->promisc);
break;
case HCI_CHANNEL_MONITOR:
@@ -900,6 +1230,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
goto done;
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* The monitor interface is restricted to CAP_NET_RAW
* capabilities and with that implicitly trusted.
*/
@@ -908,9 +1240,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
send_monitor_note(sk, "Linux version %s (%s)",
init_utsname()->release,
init_utsname()->machine);
- send_monitor_note(sk, "Bluetooth subsystem version %s",
- BT_SUBSYS_VERSION);
+ send_monitor_note(sk, "Bluetooth subsystem version %u.%u",
+ BT_SUBSYS_VERSION, BT_SUBSYS_REVISION);
send_monitor_replay(sk);
+ send_monitor_control_replay(sk);
atomic_inc(&monitor_promisc);
break;
@@ -925,6 +1258,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
err = -EPERM;
goto done;
}
+
+ hci_pi(sk)->channel = haddr.hci_channel;
break;
default:
@@ -946,6 +1281,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (capable(CAP_NET_ADMIN))
hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* At the moment the index and unconfigured index events
* are enabled unconditionally. Setting them on each
* socket when binding keeps this functionality. They
@@ -956,16 +1293,40 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
* received by untrusted users. Example for such events
* are changes to settings, class of device, name etc.
*/
- if (haddr.hci_channel == HCI_CHANNEL_CONTROL) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been
+ * assigned, this socket will transtion from
+ * a raw socket into a control socket. To
+ * allow for a clean transtion, send the
+ * close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
- hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
}
break;
}
-
- hci_pi(sk)->channel = haddr.hci_channel;
sk->sk_state = BT_BOUND;
done:
@@ -1133,6 +1494,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
goto done;
}
+ if (chan->channel == HCI_CHANNEL_CONTROL) {
+ struct sk_buff *skb;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_command(sk, index, opcode, len,
+ buf + sizeof(*hdr));
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
if (opcode >= chan->handler_count ||
chan->handlers[opcode].func == NULL) {
BT_DBG("Unknown op %u", opcode);
@@ -1440,6 +1814,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
lock_sock(sk);
if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -1523,6 +1900,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
if (get_user(len, optlen))
return -EFAULT;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d4cad29b033f..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
void l2cap_chan_hold(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
kref_get(&c->kref);
}
void l2cap_chan_put(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
kref_put(&c->kref, l2cap_chan_destroy);
}
@@ -2127,7 +2127,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
struct sk_buff **frag;
int sent = 0;
- if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count)
+ if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter))
return -EFAULT;
sent += count;
@@ -2147,8 +2147,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
*frag = tmp;
- if (copy_from_iter(skb_put(*frag, count), count,
- &msg->msg_iter) != count)
+ if (!copy_from_iter_full(skb_put(*frag, count), count,
+ &msg->msg_iter))
return -EFAULT;
sent += count;
@@ -7060,7 +7060,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst,
dst_type, __le16_to_cpu(psm));
- hdev = hci_get_route(dst, &chan->src);
+ hdev = hci_get_route(dst, &chan->src, chan->src_type);
if (!hdev)
return -EHOSTUNREACH;
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index 8319c8440c89..cb670b5594eb 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -11,6 +11,8 @@
#include "leds.h"
+DEFINE_LED_TRIGGER(bt_power_led_trigger);
+
struct hci_basic_led_trigger {
struct led_trigger led_trigger;
struct hci_dev *hdev;
@@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
if (hdev->power_led)
led_trigger_event(hdev->power_led,
enabled ? LED_FULL : LED_OFF);
+
+ if (!enabled) {
+ struct hci_dev *d;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (test_bit(HCI_UP, &d->flags))
+ enabled = true;
+ }
+
+ read_unlock(&hci_dev_list_lock);
+ }
+
+ led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
}
static void power_activate(struct led_classdev *led_cdev)
@@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev)
/* initialize power_led */
hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
}
+
+void bt_leds_init(void)
+{
+ led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger);
+}
+
+void bt_leds_cleanup(void)
+{
+ led_trigger_unregister_simple(bt_power_led_trigger);
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index a9c4d6ea01cf..08725a2fbd9b 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -7,10 +7,20 @@
*/
#if IS_ENABLED(CONFIG_BT_LEDS)
+
void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
void hci_leds_init(struct hci_dev *hdev);
+
+void bt_leds_init(void);
+void bt_leds_cleanup(void);
+
#else
+
static inline void hci_leds_update_powered(struct hci_dev *hdev,
bool enabled) {}
static inline void hci_leds_init(struct hci_dev *hdev) {}
+
+static inline void bt_leds_init(void) {}
+static inline void bt_leds_cleanup(void) {}
+
#endif
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7639290b6de3..1fba2a03f8ae 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 13
+#define MGMT_REVISION 14
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -104,6 +104,8 @@ static const u16 mgmt_commands[] = {
MGMT_OP_REMOVE_ADVERTISING,
MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_OP_START_LIMITED_DISCOVERY,
+ MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_SET_APPEARANCE,
};
static const u16 mgmt_events[] = {
@@ -141,6 +143,7 @@ static const u16 mgmt_events[] = {
MGMT_EV_LOCAL_OOB_DATA_UPDATED,
MGMT_EV_ADVERTISING_ADDED,
MGMT_EV_ADVERTISING_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -149,6 +152,7 @@ static const u16 mgmt_untrusted_commands[] = {
MGMT_OP_READ_UNCONF_INDEX_LIST,
MGMT_OP_READ_CONFIG_INFO,
MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_EXT_INFO,
};
static const u16 mgmt_untrusted_events[] = {
@@ -162,6 +166,7 @@ static const u16 mgmt_untrusted_events[] = {
MGMT_EV_NEW_CONFIG_OPTIONS,
MGMT_EV_EXT_INDEX_ADDED,
MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
@@ -256,13 +261,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
flag, skip_sk);
}
-static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data,
- u16 len, struct sock *skip_sk)
-{
- return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
- HCI_MGMT_GENERIC_EVENTS, skip_sk);
-}
-
static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
struct sock *skip_sk)
{
@@ -278,6 +276,14 @@ static u8 le_addr_type(u8 mgmt_addr_type)
return ADDR_LE_DEV_RANDOM;
}
+void mgmt_fill_version_info(void *ver)
+{
+ struct mgmt_rp_read_version *rp = ver;
+
+ rp->version = MGMT_VERSION;
+ rp->revision = cpu_to_le16(MGMT_REVISION);
+}
+
static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
@@ -285,8 +291,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("sock %p", sk);
- rp.version = MGMT_VERSION;
- rp.revision = cpu_to_le16(MGMT_REVISION);
+ mgmt_fill_version_info(&rp);
return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
&rp, sizeof(rp));
@@ -572,8 +577,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip)
{
__le32 options = get_missing_options(hdev);
- return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
- sizeof(options), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
+ sizeof(options), HCI_MGMT_OPTION_EVENTS, skip);
}
static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
@@ -862,6 +867,86 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
sizeof(rp));
}
+static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
+{
+ u16 eir_len = 0;
+ size_t name_len;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
+ hdev->appearance);
+
+ name_len = strlen(hdev->dev_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, name_len);
+
+ name_len = strlen(hdev->short_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
+ hdev->short_name, name_len);
+
+ return eir_len;
+}
+
+static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ char buf[512];
+ struct mgmt_rp_read_ext_info *rp = (void *)buf;
+ u16 eir_len;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ memset(&buf, 0, sizeof(buf));
+
+ hci_dev_lock(hdev);
+
+ bacpy(&rp->bdaddr, &hdev->bdaddr);
+
+ rp->version = hdev->hci_ver;
+ rp->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp->supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp->current_settings = cpu_to_le32(get_current_settings(hdev));
+
+
+ eir_len = append_eir_data_to_buf(hdev, rp->eir);
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ hci_dev_unlock(hdev);
+
+ /* If this command is called at least once, then the events
+ * for class of device and local name changes are disabled
+ * and only the new extended controller information event
+ * is used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
+ sizeof(*rp) + eir_len);
+}
+
+static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ char buf[512];
+ struct mgmt_ev_ext_info_changed *ev = (void *)buf;
+ u16 eir_len;
+
+ memset(buf, 0, sizeof(buf));
+
+ eir_len = append_eir_data_to_buf(hdev, ev->eir);
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
+ sizeof(*ev) + eir_len,
+ HCI_MGMT_EXT_INFO_EVENTS, skip);
+}
+
static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
{
__le32 settings = cpu_to_le32(get_current_settings(hdev));
@@ -922,7 +1007,7 @@ static int clean_up_hci_state(struct hci_dev *hdev)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
- hci_req_clear_adv_instance(hdev, NULL, 0x00, false);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false);
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
__hci_req_disable_advertising(&req);
@@ -1000,8 +1085,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip)
{
__le32 ev = cpu_to_le32(get_current_settings(hdev));
- return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
- sizeof(ev), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
}
int mgmt_new_settings(struct hci_dev *hdev)
@@ -1690,7 +1775,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
enabled = lmp_host_le_capable(hdev);
if (!val)
- hci_req_clear_adv_instance(hdev, NULL, 0x00, true);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -2435,6 +2520,8 @@ static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
if (!cmd)
return -ENOMEM;
+ cmd->cmd_complete = addr_cmd_complete;
+
err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
if (err < 0)
@@ -2513,8 +2600,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("");
if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
- return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
- MGMT_STATUS_INVALID_PARAMS, NULL, 0);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
@@ -2932,6 +3019,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}
+static void adv_expire(struct hci_dev *hdev, u32 flags)
+{
+ struct adv_info *adv_instance;
+ struct hci_request req;
+ int err;
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (!adv_instance)
+ return;
+
+ /* stop if current instance doesn't need to be changed */
+ if (!(adv_instance->flags & flags))
+ return;
+
+ cancel_adv_timeout(hdev);
+
+ adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
+ if (!adv_instance)
+ return;
+
+ hci_req_init(&req, hdev);
+ err = __hci_req_schedule_adv_instance(&req, adv_instance->instance,
+ true);
+ if (err)
+ return;
+
+ hci_req_run(&req, NULL);
+}
+
static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
struct mgmt_cp_set_local_name *cp;
@@ -2947,13 +3063,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
cp = cmd->param;
- if (status)
+ if (status) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
mgmt_status(status));
- else
+ } else {
mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
cp, sizeof(*cp));
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+ }
+
mgmt_pending_remove(cmd);
unlock:
@@ -2993,8 +3113,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
if (err < 0)
goto failed;
- err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev,
- data, len, sk);
+ err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data,
+ len, HCI_MGMT_LOCAL_NAME_EVENTS, sk);
+ ext_info_changed(hdev, sk);
goto failed;
}
@@ -3017,7 +3138,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
/* The name is stored in the scan response data and so
* no need to udpate the advertising data here.
*/
- if (lmp_le_capable(hdev))
+ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
__hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance);
err = hci_req_run(&req, set_name_complete);
@@ -3029,6 +3150,40 @@ failed:
return err;
}
+static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_appearance *cp = data;
+ u16 apperance;
+ int err;
+
+ BT_DBG("");
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ apperance = le16_to_cpu(cp->appearance);
+
+ hci_dev_lock(hdev);
+
+ if (hdev->appearance != apperance) {
+ hdev->appearance = apperance;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
+
+ ext_info_changed(hdev, sk);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL,
+ 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -4869,7 +5024,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
int err;
memset(&rp, 0, sizeof(rp));
- memcpy(&rp.addr, &cmd->param, sizeof(rp.addr));
+ memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
if (status)
goto complete;
@@ -5501,17 +5656,6 @@ unlock:
return err;
}
-static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
- u8 data_len)
-{
- eir[eir_len++] = sizeof(type) + data_len;
- eir[eir_len++] = type;
- memcpy(&eir[eir_len], data, data_len);
- eir_len += data_len;
-
- return eir_len;
-}
-
static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -5815,6 +5959,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_DISCOV;
flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
+ flags |= MGMT_ADV_FLAG_APPEARANCE;
+ flags |= MGMT_ADV_FLAG_LOCAL_NAME;
if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
flags |= MGMT_ADV_FLAG_TX_POWER;
@@ -5871,28 +6017,67 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
return err;
}
-static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
- u8 len, bool is_adv_data)
+static u8 calculate_name_len(struct hci_dev *hdev)
+{
+ u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 3];
+
+ return append_local_name(hdev, buf, 0);
+}
+
+static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags,
+ bool is_adv_data)
{
u8 max_len = HCI_MAX_AD_LENGTH;
- int i, cur_len;
- bool flags_managed = false;
- bool tx_power_managed = false;
if (is_adv_data) {
if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS)) {
- flags_managed = true;
+ MGMT_ADV_FLAG_MANAGED_FLAGS))
max_len -= 3;
- }
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER) {
- tx_power_managed = true;
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
max_len -= 3;
- }
+ } else {
+ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ max_len -= calculate_name_len(hdev);
+
+ if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE))
+ max_len -= 4;
}
+ return max_len;
+}
+
+static bool flags_managed(u32 adv_flags)
+{
+ return adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS);
+}
+
+static bool tx_power_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_TX_POWER;
+}
+
+static bool name_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME;
+}
+
+static bool appearance_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_APPEARANCE;
+}
+
+static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
+ u8 len, bool is_adv_data)
+{
+ int i, cur_len;
+ u8 max_len;
+
+ max_len = tlv_data_max_len(hdev, adv_flags, is_adv_data);
+
if (len > max_len)
return false;
@@ -5900,10 +6085,21 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
cur_len = data[i];
- if (flags_managed && data[i + 1] == EIR_FLAGS)
+ if (data[i + 1] == EIR_FLAGS &&
+ (!is_adv_data || flags_managed(adv_flags)))
+ return false;
+
+ if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags))
return false;
- if (tx_power_managed && data[i + 1] == EIR_TX_POWER)
+ if (data[i + 1] == EIR_APPEARANCE &&
+ appearance_managed(adv_flags))
return false;
/* If the current field length would exceed the total data
@@ -6175,7 +6371,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, cp->instance, true);
+ hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -6211,23 +6407,6 @@ unlock:
return err;
}
-static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
-{
- u8 max_len = HCI_MAX_AD_LENGTH;
-
- if (is_adv_data) {
- if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
- MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS))
- max_len -= 3;
-
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
- max_len -= 3;
- }
-
- return max_len;
-}
-
static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
@@ -6258,8 +6437,8 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
rp.instance = cp->instance;
rp.flags = cp->flags;
- rp.max_adv_data_len = tlv_data_max_len(flags, true);
- rp.max_scan_rsp_len = tlv_data_max_len(flags, false);
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
@@ -6356,6 +6535,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
{ get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
{ start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
+ { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_appearance, MGMT_SET_APPEARANCE_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
@@ -6494,9 +6676,12 @@ void __mgmt_power_off(struct hci_dev *hdev)
mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
- if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- zero_cod, sizeof(zero_cod), NULL);
+ if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ zero_cod, sizeof(zero_cod),
+ HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
new_settings(hdev, match.sk);
@@ -7092,9 +7277,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
- if (!status)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- dev_class, 3, NULL);
+ if (!status) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
+ 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
if (match.sk)
sock_put(match.sk);
@@ -7123,8 +7310,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
return;
}
- mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
- cmd ? cmd->sk : NULL);
+ mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL);
+ ext_info_changed(hdev, cmd ? cmd->sk : NULL);
}
static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 8c30c7eb8bef..c933bd08c1fe 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -21,12 +21,41 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <asm/unaligned.h>
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
#include <net/bluetooth/mgmt.h>
#include "mgmt_util.h"
+static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
+ u16 opcode, u16 len, void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
void *data, u16 data_len, int flag, struct sock *skip_sk)
{
@@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
__net_timestamp(skb);
hci_send_to_channel(channel, skb, flag, skip_sk);
- kfree_skb(skb);
+ if (channel == HCI_CHANNEL_CONTROL)
+ hci_send_monitor_ctrl_event(hdev, event, data, data_len,
+ skb_get_ktime(skb), flag, skip_sk);
+
+ kfree_skb(skb);
return 0;
}
int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_status *ev;
int err;
@@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
ev->status = status;
ev->opcode = cpu_to_le16(cmd);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_STATUS, sizeof(*ev), ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
void *rp, size_t rp_len)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_complete *ev;
int err;
@@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
if (rp)
memcpy(ev->data, rp, rp_len);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_COMPLETE,
+ sizeof(*ev) + rp_len, ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 8e385a0ae60e..2f2cb5e27cdd 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -178,7 +178,7 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev)
struct hci_dev *hdev;
struct hci_conn *conn;
- hdev = hci_get_route(&dev->dst, &dev->src);
+ hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR);
if (!hdev)
return;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index f52bcbf2e58c..3125ce670c2f 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -219,7 +219,7 @@ static int sco_connect(struct sock *sk)
BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
- hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src);
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
if (!hdev)
return -EHOSTUNREACH;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 4c1a16a96ae5..fae391f1871f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -57,7 +57,7 @@
#define SMP_TIMEOUT msecs_to_jiffies(30000)
#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
- 0x1f : 0x07)
+ 0x3f : 0x07)
#define KEY_DIST_MASK 0x07
/* Maximum message length that can be passed to aes_cmac */
@@ -76,6 +76,7 @@ enum {
SMP_FLAG_DHKEY_PENDING,
SMP_FLAG_REMOTE_OOB,
SMP_FLAG_LOCAL_OOB,
+ SMP_FLAG_CT2,
};
struct smp_dev {
@@ -357,6 +358,22 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
return err;
}
+static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 salt[16], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN salt %16phN", w, salt);
+
+ err = aes_cmac(tfm_cmac, salt, w, 16, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
/* The following functions map to the legacy SMP crypto functions e, c1,
* s1 and ah.
*/
@@ -1130,20 +1147,31 @@ static void sc_add_ltk(struct smp_chan *smp)
static void sc_generate_link_key(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp1' and 'lebr'.
- */
- const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'lebr'. */
const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
smp->link_key = kzalloc(16, GFP_KERNEL);
if (!smp->link_key)
return;
- if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
- kzfree(smp->link_key);
- smp->link_key = NULL;
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7031 */
+ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp1'. */
+ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
}
if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
@@ -1169,10 +1197,7 @@ static void smp_allow_key_dist(struct smp_chan *smp)
static void sc_generate_ltk(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp2' and 'brle'.
- */
- const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'brle'. */
const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
struct hci_conn *hcon = smp->conn->hcon;
struct hci_dev *hdev = hcon->hdev;
@@ -1187,8 +1212,19 @@ static void sc_generate_ltk(struct smp_chan *smp)
if (key->type == HCI_LK_DEBUG_COMBINATION)
set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
- if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7032 */
+ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
+ return;
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp2'. */
+ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
+ return;
+ }
if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
return;
@@ -1669,6 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
if (!rsp) {
memset(req, 0, sizeof(*req));
+ req->auth_req = SMP_AUTH_CT2;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->max_key_size = conn->hcon->enc_key_size;
@@ -1680,6 +1717,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
memset(rsp, 0, sizeof(*rsp));
+ rsp->auth_req = SMP_AUTH_CT2;
rsp->max_key_size = conn->hcon->enc_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
@@ -1744,6 +1782,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_bredr_pairing_cmd(smp, req, &rsp);
+ if (req->auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
key_size = min(req->max_key_size, rsp.max_key_size);
if (check_enc_key_size(conn, key_size))
return SMP_ENC_KEY_SIZE;
@@ -1761,9 +1802,13 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_pairing_cmd(conn, req, &rsp, auth);
- if (rsp.auth_req & SMP_AUTH_SC)
+ if (rsp.auth_req & SMP_AUTH_SC) {
set_bit(SMP_FLAG_SC, &smp->flags);
+ if (rsp.auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+ }
+
if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
sec_level = BT_SECURITY_MEDIUM;
else
@@ -1917,6 +1962,9 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
*/
smp->remote_key_dist &= rsp->resp_key_dist;
+ if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2))
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
/* For BR/EDR this means we're done and can start phase 3 */
if (conn->hcon->type == ACL_LINK) {
/* Clear bits which are generated but not distributed */
@@ -2312,8 +2360,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
authreq = seclevel_to_authreq(sec_level);
- if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED))
+ if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) {
authreq |= SMP_AUTH_SC;
+ if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED))
+ authreq |= SMP_AUTH_CT2;
+ }
/* Require MITM if IO Capability allows or the security level
* requires it.
@@ -3387,7 +3438,10 @@ int smp_register(struct hci_dev *hdev)
if (!lmp_sc_capable(hdev)) {
debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
hdev, &force_bredr_smp_fops);
- return 0;
+
+ /* Flag can be already set here (due to power toggle) */
+ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return 0;
}
if (WARN_ON(hdev->smp_bredr_data)) {
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index ffcc70b6b199..0ff6247eaa6c 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -57,6 +57,7 @@ struct smp_cmd_pairing {
#define SMP_AUTH_MITM 0x04
#define SMP_AUTH_SC 0x08
#define SMP_AUTH_KEYPRESS 0x10
+#define SMP_AUTH_CT2 0x20
#define SMP_CMD_PAIRING_CONFIRM 0x03
struct smp_cmd_pairing_confirm {
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index a1cda5d4718d..0aefc011b668 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
+
obj-$(CONFIG_NETFILTER) += netfilter/
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 3addc05b9a16..889e5640455f 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -227,9 +227,11 @@ static int __init br_init(void)
br_fdb_test_addr_hook = br_fdb_test_addr;
#endif
- pr_info("bridge: automatic filtering via arp/ip/ip6tables has been "
- "deprecated. Update your scripts to load br_netfilter if you "
+#if IS_MODULE(CONFIG_BRIDGE_NETFILTER)
+ pr_info("bridge: filtering via arp/ip/ip6tables is no longer available "
+ "by default. Update your scripts to load br_netfilter if you "
"need this.\n");
+#endif
return 0;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 09f26940aba5..ed3b3192fb00 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -19,7 +19,7 @@
#include <linux/list.h>
#include <linux/netfilter_bridge.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
@@ -62,10 +62,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
goto out;
if (is_broadcast_ether_addr(dest)) {
- br_flood(br, skb, false, false, true);
+ br_flood(br, skb, BR_PKT_BROADCAST, false, true);
} else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood(br, skb, false, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true);
goto out;
}
if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -78,11 +78,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_multicast_querier_exists(br, eth_hdr(skb)))
br_multicast_flood(mdst, skb, false, true);
else
- br_flood(br, skb, false, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true);
} else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
- br_flood(br, skb, true, false, true);
+ br_flood(br, skb, BR_PKT_UNICAST, false, true);
}
out:
rcu_read_unlock();
@@ -185,7 +185,7 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
static int br_change_mtu(struct net_device *dev, int new_mtu)
{
struct net_bridge *br = netdev_priv(dev);
- if (new_mtu < 68 || new_mtu > br_min_mtu(br))
+ if (new_mtu > br_min_mtu(br))
return -EINVAL;
dev->mtu = new_mtu;
@@ -409,7 +409,8 @@ void br_dev_setup(struct net_device *dev)
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
- br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ dev->max_mtu = ETH_MAX_MTU;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index cd620fab41b0..e4a4176171c9 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -535,9 +535,8 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
*/
if (fdb->is_local)
return 0;
- br_warn(br, "adding interface %s with same address "
- "as a received packet\n",
- source ? source->dev->name : br->dev->name);
+ br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
+ source ? source->dev->name : br->dev->name, addr, vid);
fdb_delete(br, fdb);
}
@@ -583,9 +582,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
if (net_ratelimit())
- br_warn(br, "received packet on %s with "
- "own address as source address\n",
- source->dev->name);
+ br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
+ source->dev->name, addr, vid);
} else {
/* fastpath: update of existing entry */
if (unlikely(source != fdb->dst)) {
@@ -710,24 +708,27 @@ int br_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
- int idx)
+ int *idx)
{
struct net_bridge *br = netdev_priv(dev);
+ int err = 0;
int i;
if (!(dev->priv_flags & IFF_EBRIDGE))
goto out;
- if (!filter_dev)
- idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (!filter_dev) {
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (err < 0)
+ goto out;
+ }
for (i = 0; i < BR_HASH_SIZE; i++) {
struct net_bridge_fdb_entry *f;
hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
- int err;
- if (idx < cb->args[0])
+ if (*idx < cb->args[2])
goto skip;
if (filter_dev &&
@@ -750,17 +751,15 @@ int br_fdb_dump(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI);
- if (err < 0) {
- cb->args[1] = err;
- break;
- }
+ if (err < 0)
+ goto out;
skip:
- ++idx;
+ *idx += 1;
}
}
out:
- return idx;
+ return err;
}
/* Update (create or replace) forwarding database entry */
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 63a83d8d7da3..7cb41aee4c82 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
- br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING;
+ br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
+ nbp_switchdev_allowed_egress(p, skb);
}
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -175,7 +176,7 @@ out:
/* called under rcu_read_lock */
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- bool unicast, bool local_rcv, bool local_orig)
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
{
u8 igmp_type = br_multicast_igmp_type(skb);
struct net_bridge_port *prev = NULL;
@@ -183,7 +184,10 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
list_for_each_entry_rcu(p, &br->port_list, list) {
/* Do not flood unicast traffic to ports that turn it off */
- if (unicast && !(p->flags & BR_FLOOD))
+ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
+ continue;
+ if (pkt_type == BR_PKT_MULTICAST &&
+ !(p->flags & BR_MCAST_FLOOD))
continue;
/* Do not flood to ports that enable proxy ARP */
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f2fede05d32c..ed0dd3340084 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
- p->flags = BR_LEARNING | BR_FLOOD;
+ p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD;
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
@@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (err)
goto err5;
+ err = nbp_switchdev_mark_set(p);
+ if (err)
+ goto err6;
+
dev_disable_lro(dev);
list_add_rcu(&p->list, &br->port_list);
@@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
err = nbp_vlan_init(p);
if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
- goto err6;
+ goto err7;
}
spin_lock_bh(&br->lock);
@@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
return 0;
-err6:
+err7:
list_del_rcu(&p->list);
br_fdb_delete_by_port(br, p, 0, 1);
nbp_update_port_count(br);
+err6:
netdev_upper_dev_unlink(dev, br->dev);
-
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index abe11f085479..855b72fbe1da 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -128,11 +128,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- bool local_rcv = false, mcast_hit = false, unicast = true;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
const unsigned char *dest = eth_hdr(skb)->h_dest;
+ enum br_pkt_type pkt_type = BR_PKT_UNICAST;
struct net_bridge_fdb_entry *dst = NULL;
struct net_bridge_mdb_entry *mdst;
+ bool local_rcv, mcast_hit = false;
struct net_bridge *br;
u16 vid = 0;
@@ -142,29 +143,36 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
goto out;
+ nbp_switchdev_frame_mark(p, skb);
+
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
if (p->flags & BR_LEARNING)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
- if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
- br_multicast_rcv(br, p, skb, vid))
- goto drop;
+ local_rcv = !!(br->dev->flags & IFF_PROMISC);
+ if (is_multicast_ether_addr(dest)) {
+ /* by definition the broadcast is also a multicast address */
+ if (is_broadcast_ether_addr(dest)) {
+ pkt_type = BR_PKT_BROADCAST;
+ local_rcv = true;
+ } else {
+ pkt_type = BR_PKT_MULTICAST;
+ if (br_multicast_rcv(br, p, skb, vid))
+ goto drop;
+ }
+ }
if (p->state == BR_STATE_LEARNING)
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
- local_rcv = !!(br->dev->flags & IFF_PROMISC);
-
if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
br_do_proxy_arp(skb, br, vid, p);
- if (is_broadcast_ether_addr(dest)) {
- local_rcv = true;
- unicast = false;
- } else if (is_multicast_ether_addr(dest)) {
+ switch (pkt_type) {
+ case BR_PKT_MULTICAST:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
@@ -178,18 +186,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
local_rcv = true;
br->dev->stats.multicast++;
}
- unicast = false;
- } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {
- /* Do not forward the packet since it's local. */
- return br_pass_frame_up(skb);
+ break;
+ case BR_PKT_UNICAST:
+ dst = __br_fdb_get(br, dest, vid);
+ default:
+ break;
}
if (dst) {
+ if (dst->is_local)
+ return br_pass_frame_up(skb);
+
dst->used = jiffies;
br_forward(dst->dst, skb, local_rcv, false);
} else {
if (!mcast_hit)
- br_flood(br, skb, unicast, local_rcv, false);
+ br_flood(br, skb, pkt_type, local_rcv, false);
else
br_multicast_flood(mdst, skb, local_rcv, false);
}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index d99b2009771a..da8157c57eb1 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -18,7 +18,7 @@
#include <linux/slab.h>
#include <linux/times.h>
#include <net/net_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
static int get_bridge_ifindices(struct net *net, int *indices, int num)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index c5fea9393946..b30e77e8427c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/inetdevice.h>
+#include <linux/mroute.h>
#include <net/ip.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -364,13 +365,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
__be32 group,
u8 *igmp_type)
{
+ struct igmpv3_query *ihv3;
+ size_t igmp_hdr_size;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
struct iphdr *iph;
+ igmp_hdr_size = sizeof(*ih);
+ if (br->multicast_igmp_version == 3)
+ igmp_hdr_size = sizeof(*ihv3);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- sizeof(*ih) + 4);
+ igmp_hdr_size + 4);
if (!skb)
goto out;
@@ -395,7 +401,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4);
+ iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
@@ -411,17 +417,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_put(skb, 24);
skb_set_transport_header(skb, skb->len);
- ih = igmp_hdr(skb);
*igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->code = (group ? br->multicast_last_member_interval :
- br->multicast_query_response_interval) /
- (HZ / IGMP_TIMER_SCALE);
- ih->group = group;
- ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- skb_put(skb, sizeof(*ih));
+ switch (br->multicast_igmp_version) {
+ case 2:
+ ih = igmp_hdr(skb);
+ ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ih->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ih->group = group;
+ ih->csum = 0;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ break;
+ case 3:
+ ihv3 = igmpv3_query_hdr(skb);
+ ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ihv3->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ihv3->group = group;
+ ihv3->qqic = br->multicast_query_interval / HZ;
+ ihv3->nsrcs = 0;
+ ihv3->resv = 0;
+ ihv3->suppress = 0;
+ ihv3->qrv = 2;
+ ihv3->csum = 0;
+ ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ break;
+ }
+
+ skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -433,15 +459,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
const struct in6_addr *grp,
u8 *igmp_type)
{
- struct sk_buff *skb;
+ struct mld2_query *mld2q;
+ unsigned long interval;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
+ size_t mld_hdr_size;
+ struct sk_buff *skb;
struct ethhdr *eth;
u8 *hopopt;
- unsigned long interval;
+ mld_hdr_size = sizeof(*mldq);
+ if (br->multicast_mld_version == 2)
+ mld_hdr_size = sizeof(*mld2q);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + sizeof(*mldq));
+ 8 + mld_hdr_size);
if (!skb)
goto out;
@@ -460,7 +491,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h = ipv6_hdr(skb);
*(__force __be32 *)ip6h = htonl(0x60000000);
- ip6h->payload_len = htons(8 + sizeof(*mldq));
+ ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
@@ -488,26 +519,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- mldq = (struct mld_msg *) icmp6_hdr(skb);
-
interval = ipv6_addr_any(grp) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
-
*igmp_type = ICMPV6_MGM_QUERY;
- mldq->mld_type = ICMPV6_MGM_QUERY;
- mldq->mld_code = 0;
- mldq->mld_cksum = 0;
- mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
- mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
-
- /* checksum */
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq), 0));
- skb_put(skb, sizeof(*mldq));
+ switch (br->multicast_mld_version) {
+ case 1:
+ mldq = (struct mld_msg *)icmp6_hdr(skb);
+ mldq->mld_type = ICMPV6_MGM_QUERY;
+ mldq->mld_code = 0;
+ mldq->mld_cksum = 0;
+ mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+ mldq->mld_reserved = 0;
+ mldq->mld_mca = *grp;
+ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mldq), IPPROTO_ICMPV6,
+ csum_partial(mldq,
+ sizeof(*mldq),
+ 0));
+ break;
+ case 2:
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval));
+ mld2q->mld2q_type = ICMPV6_MGM_QUERY;
+ mld2q->mld2q_code = 0;
+ mld2q->mld2q_cksum = 0;
+ mld2q->mld2q_resv1 = 0;
+ mld2q->mld2q_resv2 = 0;
+ mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_qrv = 2;
+ mld2q->mld2q_nsrcs = 0;
+ mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
+ mld2q->mld2q_mca = *grp;
+ mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mld2q),
+ IPPROTO_ICMPV6,
+ csum_partial(mld2q,
+ sizeof(*mld2q),
+ 0));
+ break;
+ }
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -607,7 +659,8 @@ err:
}
struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
- struct net_bridge_port *port, struct br_ip *group)
+ struct net_bridge_port *p,
+ struct br_ip *group)
{
struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
@@ -623,7 +676,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
}
hash = br_ip_hash(mdb, group);
- mp = br_multicast_get_group(br, port, group, hash);
+ mp = br_multicast_get_group(br, p, group, hash);
switch (PTR_ERR(mp)) {
case 0:
break;
@@ -680,9 +733,9 @@ static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group)
{
- struct net_bridge_mdb_entry *mp;
- struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_mdb_entry *mp;
unsigned long now = jiffies;
int err;
@@ -860,9 +913,9 @@ static void br_multicast_send_query(struct net_bridge *br,
struct net_bridge_port *port,
struct bridge_mcast_own_query *own_query)
{
- unsigned long time;
- struct br_ip br_group;
struct bridge_mcast_other_query *other_query = NULL;
+ struct br_ip br_group;
+ unsigned long time;
if (!netif_running(br->dev) || br->multicast_disabled ||
!br->multicast_querier)
@@ -972,13 +1025,12 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query)
mod_timer(&query->timer, jiffies);
}
-void br_multicast_enable_port(struct net_bridge_port *port)
+static void __br_multicast_enable_port(struct net_bridge_port *port)
{
struct net_bridge *br = port->br;
- spin_lock(&br->multicast_lock);
if (br->multicast_disabled || !netif_running(br->dev))
- goto out;
+ return;
br_multicast_enable(&port->ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
@@ -987,8 +1039,14 @@ void br_multicast_enable_port(struct net_bridge_port *port)
if (port->multicast_router == MDB_RTR_TYPE_PERM &&
hlist_unhashed(&port->rlist))
br_multicast_add_router(br, port);
+}
-out:
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+ struct net_bridge *br = port->br;
+
+ spin_lock(&br->multicast_lock);
+ __br_multicast_enable_port(port);
spin_unlock(&br->multicast_lock);
}
@@ -1633,6 +1691,21 @@ static void br_multicast_err_count(const struct net_bridge *br,
u64_stats_update_end(&pstats->syncp);
}
+static void br_multicast_pim(struct net_bridge *br,
+ struct net_bridge_port *port,
+ const struct sk_buff *skb)
+{
+ unsigned int offset = skb_transport_offset(skb);
+ struct pimhdr *pimhdr, _pimhdr;
+
+ pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr);
+ if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION ||
+ pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
+ return;
+
+ br_multicast_mark_router(br, port);
+}
+
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
@@ -1645,8 +1718,12 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = ip_mc_check_igmp(skb, &skb_trimmed);
if (err == -ENOMSG) {
- if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr))
+ if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) {
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
+ if (ip_hdr(skb)->protocol == IPPROTO_PIM)
+ br_multicast_pim(br, port, skb);
+ }
return 0;
} else if (err < 0) {
br_multicast_err_count(br, port, skb->protocol);
@@ -1806,7 +1883,9 @@ void br_multicast_init(struct net_bridge *br)
br->ip4_other_query.delay_time = 0;
br->ip4_querier.port = NULL;
+ br->multicast_igmp_version = 2;
#if IS_ENABLED(CONFIG_IPV6)
+ br->multicast_mld_version = 1;
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
@@ -1994,8 +2073,9 @@ static void br_multicast_start_querier(struct net_bridge *br,
int br_multicast_toggle(struct net_bridge *br, unsigned long val)
{
- int err = 0;
struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_port *port;
+ int err = 0;
spin_lock_bh(&br->multicast_lock);
if (br->multicast_disabled == !val)
@@ -2023,10 +2103,9 @@ rollback:
goto rollback;
}
- br_multicast_start_querier(br, &br->ip4_own_query);
-#if IS_ENABLED(CONFIG_IPV6)
- br_multicast_start_querier(br, &br->ip6_own_query);
-#endif
+ br_multicast_open(br);
+ list_for_each_entry(port, &br->port_list, list)
+ __br_multicast_enable_port(port);
unlock:
spin_unlock_bh(&br->multicast_lock);
@@ -2107,6 +2186,44 @@ unlock:
return err;
}
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support only version 2 and 3 */
+ switch (val) {
+ case 2:
+ case 3:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_igmp_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support version 1 and 2 */
+ switch (val) {
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_mld_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+#endif
+
/**
* br_multicast_list_adjacent - Returns snooped multicast addresses
* @dev: The bridge port adjacent to which to retrieve addresses
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 77e7f69bf80d..95087e6e8258 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -30,6 +30,7 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
+#include <linux/rculist.h>
#include <linux/inetdevice.h>
#include <net/ip.h>
@@ -39,13 +40,13 @@
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-static int brnf_net_id __read_mostly;
+static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
bool enabled;
@@ -395,11 +396,10 @@ bridged_dnat:
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE,
- NF_BR_PRE_ROUTING,
- net, sk, skb, skb->dev, NULL,
- br_nf_pre_routing_finish_bridge,
- 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev,
+ NULL,
+ br_nf_pre_routing_finish_bridge);
return 0;
}
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -417,10 +417,8 @@ bridged_dnat:
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
- skb->dev, NULL,
- br_handle_frame_finish, 1);
-
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
+ br_handle_frame_finish);
return 0;
}
@@ -563,8 +561,8 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
}
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb,
- in, skb->dev, br_forward_finish, 1);
+ br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
+ br_forward_finish);
return 0;
}
@@ -847,8 +845,10 @@ static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
- return NF_STOP;
+ if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+ state->okfn(state->net, state->sk, skb);
+ return NF_STOLEN;
+ }
return NF_ACCEPT;
}
@@ -992,6 +992,43 @@ static struct notifier_block brnf_notifier __read_mostly = {
.notifier_call = brnf_device_event,
};
+/* recursively invokes nf_hook_slow (again), skipping already-called
+ * hooks (< NF_BR_PRI_BRNF).
+ *
+ * Called with rcu read lock held.
+ */
+int br_nf_hook_thresh(unsigned int hook, struct net *net,
+ struct sock *sk, struct sk_buff *skb,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ struct nf_hook_entry *elem;
+ struct nf_hook_state state;
+ int ret;
+
+ for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+ elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF;
+ elem = rcu_dereference(elem->next))
+ ;
+
+ if (!elem)
+ return okfn(net, sk, skb);
+
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+ nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
+ sk, net, okfn);
+
+ ret = nf_hook_slow(skb, &state, elem);
+ rcu_read_unlock();
+ if (ret == 1)
+ ret = okfn(net, sk, skb);
+
+ return ret;
+}
+
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5e59a8457e7b..96c072e71ea2 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -38,7 +38,7 @@
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
- net, sk, skb, skb->dev, NULL,
- br_nf_pre_routing_finish_bridge,
- 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev, NULL,
+ br_nf_pre_routing_finish_bridge);
return 0;
}
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
- skb->dev, NULL,
- br_handle_frame_finish, 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
+ skb->dev, NULL, br_handle_frame_finish);
return 0;
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index f2a29e467e78..7109b389ea58 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,10 +169,15 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
- nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
- nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROTECT,
+ !!(p->flags & BR_ROOT_BLOCK)) ||
+ nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
+ !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
- nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
+ !!(p->flags & BR_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
+ !!(p->flags & BR_MCAST_FLOOD)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
!!(p->flags & BR_PROXYARP_WIFI)) ||
@@ -630,6 +635,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
@@ -775,20 +781,6 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
}
-static int br_dev_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
-{
- struct net_bridge *br = netdev_priv(dev);
-
- if (tb[IFLA_ADDRESS]) {
- spin_lock_bh(&br->lock);
- br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
- spin_unlock_bh(&br->lock);
- }
-
- return register_netdevice(dev);
-}
-
static int br_port_slave_changelink(struct net_device *brdev,
struct net_device *dev,
struct nlattr *tb[],
@@ -852,6 +844,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1063,6 +1057,26 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
br->multicast_stats_enabled = !!mcast_stats;
}
+
+ if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
+ __u8 igmp_version;
+
+ igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(br, igmp_version);
+ if (err)
+ return err;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (data[IFLA_BR_MCAST_MLD_VERSION]) {
+ __u8 mld_version;
+
+ mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(br, mld_version);
+ if (err)
+ return err;
+ }
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1087,6 +1101,25 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
return 0;
}
+static int br_dev_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_bridge *br = netdev_priv(dev);
+ int err;
+
+ if (tb[IFLA_ADDRESS]) {
+ spin_lock_bh(&br->lock);
+ br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
+ spin_unlock_bh(&br->lock);
+ }
+
+ err = br_changelink(dev, tb, data);
+ if (err)
+ return err;
+
+ return register_netdevice(dev);
+}
+
static size_t br_get_size(const struct net_device *brdev)
{
return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */
@@ -1129,6 +1162,8 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1204,9 +1239,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
- br->multicast_startup_query_count))
+ br->multicast_startup_query_count) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
+ br->multicast_igmp_version))
return -EMSGSIZE;
-
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
+ br->multicast_mld_version))
+ return -EMSGSIZE;
+#endif
clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
IFLA_BR_PAD))
@@ -1245,14 +1286,30 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
return 0;
}
-static size_t bridge_get_linkxstats_size(const struct net_device *dev)
+static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
{
- struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p = NULL;
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
+ struct net_bridge *br;
int numvls = 0;
- vg = br_vlan_group(br);
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return 0;
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ break;
+ default:
+ return 0;
+ }
+
if (vg) {
/* we need to count all, even placeholder entries */
list_for_each_entry(v, &vg->vlan_list, vlist)
@@ -1264,45 +1321,42 @@ static size_t bridge_get_linkxstats_size(const struct net_device *dev)
nla_total_size(0);
}
-static size_t brport_get_linkxstats_size(const struct net_device *dev)
-{
- return nla_total_size(sizeof(struct br_mcast_stats)) +
- nla_total_size(0);
-}
-
-static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
+static int br_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx, int attr)
{
- size_t retsize = 0;
+ struct nlattr *nla __maybe_unused;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br;
+ struct nlattr *nest;
+ int vl_idx = 0;
switch (attr) {
case IFLA_STATS_LINK_XSTATS:
- retsize = bridge_get_linkxstats_size(dev);
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
break;
case IFLA_STATS_LINK_XSTATS_SLAVE:
- retsize = brport_get_linkxstats_size(dev);
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return 0;
+ br = p->br;
+ vg = nbp_vlan_group(p);
break;
+ default:
+ return -EINVAL;
}
- return retsize;
-}
-
-static int bridge_fill_linkxstats(struct sk_buff *skb,
- const struct net_device *dev,
- int *prividx)
-{
- struct net_bridge *br = netdev_priv(dev);
- struct nlattr *nla __maybe_unused;
- struct net_bridge_vlan_group *vg;
- struct net_bridge_vlan *v;
- struct nlattr *nest;
- int vl_idx = 0;
-
nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
if (!nest)
return -EMSGSIZE;
- vg = br_vlan_group(br);
if (vg) {
+ u16 pvid;
+
+ pvid = br_get_pvid(vg);
list_for_each_entry(v, &vg->vlan_list, vlist) {
struct bridge_vlan_xstats vxi;
struct br_vlan_stats stats;
@@ -1311,6 +1365,9 @@ static int bridge_fill_linkxstats(struct sk_buff *skb,
continue;
memset(&vxi, 0, sizeof(vxi));
vxi.vid = v->vid;
+ vxi.flags = v->flags;
+ if (v->vid == pvid)
+ vxi.flags |= BRIDGE_VLAN_INFO_PVID;
br_vlan_get_stats(v, &stats);
vxi.rx_bytes = stats.rx_bytes;
vxi.rx_packets = stats.rx_packets;
@@ -1329,7 +1386,7 @@ static int bridge_fill_linkxstats(struct sk_buff *skb,
BRIDGE_XSTATS_PAD);
if (!nla)
goto nla_put_failure;
- br_multicast_get_stats(br, NULL, nla_data(nla));
+ br_multicast_get_stats(br, p, nla_data(nla));
}
#endif
nla_nest_end(skb, nest);
@@ -1344,52 +1401,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int brport_fill_linkxstats(struct sk_buff *skb,
- const struct net_device *dev,
- int *prividx)
-{
- struct net_bridge_port *p = br_port_get_rtnl(dev);
- struct nlattr *nla __maybe_unused;
- struct nlattr *nest;
-
- if (!p)
- return 0;
-
- nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
- if (!nest)
- return -EMSGSIZE;
-#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
- sizeof(struct br_mcast_stats),
- BRIDGE_XSTATS_PAD);
- if (!nla) {
- nla_nest_end(skb, nest);
- return -EMSGSIZE;
- }
- br_multicast_get_stats(p->br, p, nla_data(nla));
-#endif
- nla_nest_end(skb, nest);
-
- return 0;
-}
-
-static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
- int *prividx, int attr)
-{
- int ret = -EINVAL;
-
- switch (attr) {
- case IFLA_STATS_LINK_XSTATS:
- ret = bridge_fill_linkxstats(skb, dev, prividx);
- break;
- case IFLA_STATS_LINK_XSTATS_SLAVE:
- ret = brport_fill_linkxstats(skb, dev, prividx);
- break;
- }
-
- return ret;
-}
-
static struct rtnl_af_ops br_af_ops __read_mostly = {
.family = AF_BRIDGE,
.get_link_af_size = br_get_link_af_size_filtered,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aac2a6e6b008..8ce621e8345c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -251,6 +251,9 @@ struct net_bridge_port
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
+#ifdef CONFIG_NET_SWITCHDEV
+ int offload_fwd_mark;
+#endif
};
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
@@ -297,10 +300,11 @@ struct net_bridge
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
- unsigned long bridge_max_age;
unsigned long ageing_time;
+ unsigned long bridge_max_age;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
+ unsigned long bridge_ageing_time;
u8 group_addr[ETH_ALEN];
bool group_addr_set;
@@ -330,6 +334,8 @@ struct net_bridge
u32 multicast_last_member_count;
u32 multicast_startup_query_count;
+ u8 multicast_igmp_version;
+
unsigned long multicast_last_member_interval;
unsigned long multicast_membership_interval;
unsigned long multicast_querier_interval;
@@ -350,6 +356,7 @@ struct net_bridge
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
+ u8 multicast_mld_version;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif
@@ -359,6 +366,11 @@ struct net_bridge
struct timer_list gc_timer;
struct kobject *ifobj;
u32 auto_cnt;
+
+#ifdef CONFIG_NET_SWITCHDEV
+ int offload_fwd_mark;
+#endif
+
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
u8 vlan_enabled;
@@ -381,6 +393,10 @@ struct br_input_skb_cb {
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool vlan_filtered;
#endif
+
+#ifdef CONFIG_NET_SWITCHDEV
+ int offload_fwd_mark;
+#endif
};
#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
@@ -496,7 +512,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
const unsigned char *addr, u16 vid, u16 nlh_flags);
int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct net_device *dev, struct net_device *fdev, int idx);
+ struct net_device *dev, struct net_device *fdev, int *idx);
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
@@ -505,12 +521,17 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
/* br_forward.c */
+enum br_pkt_type {
+ BR_PKT_UNICAST,
+ BR_PKT_MULTICAST,
+ BR_PKT_BROADCAST
+};
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
bool local_rcv, bool local_orig);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- bool unicast, bool local_rcv, bool local_orig);
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
/* br_if.c */
void br_port_carrier_check(struct net_bridge_port *p);
@@ -565,6 +586,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
int br_multicast_toggle(struct net_bridge *br, unsigned long val);
int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+#endif
struct net_bridge_mdb_entry *
br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
struct net_bridge_mdb_entry *
@@ -975,6 +1000,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
int br_set_forward_delay(struct net_bridge *br, unsigned long x);
int br_set_hello_time(struct net_bridge *br, unsigned long x);
int br_set_max_age(struct net_bridge *br, unsigned long x);
+int __set_ageing_time(struct net_device *dev, unsigned long t);
int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
@@ -1034,4 +1060,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; }
static inline void br_sysfs_delbr(struct net_device *dev) { return; }
#endif /* CONFIG_SYSFS */
+/* br_switchdev.c */
+#ifdef CONFIG_NET_SWITCHDEV
+int nbp_switchdev_mark_set(struct net_bridge_port *p);
+void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb);
+bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb);
+#else
+static inline int nbp_switchdev_mark_set(struct net_bridge_port *p)
+{
+ return 0;
+}
+
+static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ return true;
+}
+#endif /* CONFIG_NET_SWITCHDEV */
+
#endif
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index 2fe910c4e170..3f7543a29b76 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -61,6 +61,7 @@ void br_received_tcn_bpdu(struct net_bridge_port *p);
void br_transmit_config(struct net_bridge_port *p);
void br_transmit_tcn(struct net_bridge *br);
void br_topology_change_detection(struct net_bridge *br);
+void __br_set_topology_change(struct net_bridge *br, unsigned char val);
/* br_stp_bpdu.c */
void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 9258b8ef14ff..71fd1a4e63cc 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -234,7 +234,7 @@ static void br_record_config_timeout_values(struct net_bridge *br,
br->max_age = bpdu->max_age;
br->hello_time = bpdu->hello_time;
br->forward_delay = bpdu->forward_delay;
- br->topology_change = bpdu->topology_change;
+ __br_set_topology_change(br, bpdu->topology_change);
}
/* called under bridge lock */
@@ -344,7 +344,7 @@ void br_topology_change_detection(struct net_bridge *br)
isroot ? "propagating" : "sending tcn bpdu");
if (isroot) {
- br->topology_change = 1;
+ __br_set_topology_change(br, 1);
mod_timer(&br->topology_change_timer, jiffies
+ br->bridge_forward_delay + br->bridge_max_age);
} else if (!br->topology_change_detected) {
@@ -562,6 +562,24 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
}
+/* called under bridge lock */
+int __set_ageing_time(struct net_device *dev, unsigned long t)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+ .u.ageing_time = jiffies_to_clock_t(t),
+ };
+ int err;
+
+ err = switchdev_port_attr_set(dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
/* Set time interval that dynamic forwarding entries live
* For pure software bridge, allow values outside the 802.1
* standard specification for special cases:
@@ -572,25 +590,52 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
*/
int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
{
- struct switchdev_attr attr = {
- .orig_dev = br->dev,
- .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
- .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
- .u.ageing_time = ageing_time,
- };
unsigned long t = clock_t_to_jiffies(ageing_time);
int err;
- err = switchdev_port_attr_set(br->dev, &attr);
- if (err && err != -EOPNOTSUPP)
+ err = __set_ageing_time(br->dev, t);
+ if (err)
return err;
+ spin_lock_bh(&br->lock);
+ br->bridge_ageing_time = t;
br->ageing_time = t;
+ spin_unlock_bh(&br->lock);
+
mod_timer(&br->gc_timer, jiffies);
return 0;
}
+/* called under bridge lock */
+void __br_set_topology_change(struct net_bridge *br, unsigned char val)
+{
+ unsigned long t;
+ int err;
+
+ if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) {
+ /* On topology change, set the bridge ageing time to twice the
+ * forward delay. Otherwise, restore its default ageing time.
+ */
+
+ if (val) {
+ t = 2 * br->forward_delay;
+ br_debug(br, "decreasing ageing time to %lu\n", t);
+ } else {
+ t = br->bridge_ageing_time;
+ br_debug(br, "restoring ageing time to %lu\n", t);
+ }
+
+ err = __set_ageing_time(br->dev, t);
+ if (err)
+ br_warn(br, "error offloading ageing time\n");
+ else
+ br->ageing_time = t;
+ }
+
+ br->topology_change = val;
+}
+
void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
{
br->bridge_forward_delay = t;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 341caa0ca63a..6c1e21411125 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -36,12 +36,6 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
/* called under bridge lock */
void br_init_port(struct net_bridge_port *p)
{
- struct switchdev_attr attr = {
- .orig_dev = p->dev,
- .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
- .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
- .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
- };
int err;
p->port_id = br_make_port_id(p->priority, p->port_no);
@@ -50,9 +44,9 @@ void br_init_port(struct net_bridge_port *p)
p->topology_change_ack = 0;
p->config_pending = 0;
- err = switchdev_port_attr_set(p->dev, &attr);
- if (err && err != -EOPNOTSUPP)
- netdev_err(p->dev, "failed to set HW ageing time\n");
+ err = __set_ageing_time(p->dev, p->br->ageing_time);
+ if (err)
+ netdev_err(p->dev, "failed to offload ageing time\n");
}
/* NO locks held */
@@ -87,7 +81,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
}
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
br->topology_change_detected = 0;
spin_unlock_bh(&br->lock);
@@ -134,17 +128,36 @@ void br_stp_disable_port(struct net_bridge_port *p)
br_become_root_bridge(br);
}
-static void br_stp_start(struct net_bridge *br)
+static int br_stp_call_user(struct net_bridge *br, char *arg)
{
- int r;
- char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
+ char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL };
char *envp[] = { NULL };
+ int rc;
+
+ /* call userspace STP and report program errors */
+ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (rc > 0) {
+ if (rc & 0xff)
+ br_debug(br, BR_STP_PROG " received signal %d\n",
+ rc & 0x7f);
+ else
+ br_debug(br, BR_STP_PROG " exited with code %d\n",
+ (rc >> 8) & 0xff);
+ }
+
+ return rc;
+}
+
+static void br_stp_start(struct net_bridge *br)
+{
struct net_bridge_port *p;
+ int err = -ENOENT;
if (net_eq(dev_net(br->dev), &init_net))
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
- else
- r = -ENOENT;
+ err = br_stp_call_user(br, "start");
+
+ if (err && err != -ENOENT)
+ br_err(br, "failed to start userspace STP (%d)\n", err);
spin_lock_bh(&br->lock);
@@ -153,9 +166,10 @@ static void br_stp_start(struct net_bridge *br)
else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
__br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
- if (r == 0) {
+ if (!err) {
br->stp_enabled = BR_USER_STP;
br_debug(br, "userspace STP started\n");
+
/* Stop hello and hold timers */
del_timer(&br->hello_timer);
list_for_each_entry(p, &br->port_list, list)
@@ -173,14 +187,13 @@ static void br_stp_start(struct net_bridge *br)
static void br_stp_stop(struct net_bridge *br)
{
- int r;
- char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
- char *envp[] = { NULL };
struct net_bridge_port *p;
+ int err;
if (br->stp_enabled == BR_USER_STP) {
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
- br_info(br, "userspace STP stopped, return code %d\n", r);
+ err = br_stp_call_user(br, "stop");
+ if (err)
+ br_err(br, "failed to stop userspace STP (%d)\n", err);
/* To start timers on any ports left in blocking */
mod_timer(&br->hello_timer, jiffies + br->hello_time);
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index da058b85aa22..7ddb38e0a06e 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -125,7 +125,7 @@ static void br_topology_change_timer_expired(unsigned long arg)
br_debug(br, "topo change timer expired\n");
spin_lock(&br->lock);
br->topology_change_detected = 0;
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
spin_unlock(&br->lock);
}
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
new file mode 100644
index 000000000000..f4097b900de1
--- /dev/null
+++ b/net/bridge/br_switchdev.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+
+static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev)
+{
+ struct net_bridge_port *p;
+
+ /* dev is yet to be added to the port list. */
+ list_for_each_entry(p, &br->port_list, list) {
+ if (switchdev_port_same_parent_id(dev, p->dev))
+ return p->offload_fwd_mark;
+ }
+
+ return ++br->offload_fwd_mark;
+}
+
+int nbp_switchdev_mark_set(struct net_bridge_port *p)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ };
+ int err;
+
+ ASSERT_RTNL();
+
+ err = switchdev_port_attr_get(p->dev, &attr);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev);
+
+ return 0;
+}
+
+void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark))
+ BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark;
+}
+
+bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ return !skb->offload_fwd_mark ||
+ BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark;
+}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index e120307c6e36..a18148213b08 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -440,6 +440,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(hash_max);
+static ssize_t multicast_igmp_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_igmp_version);
+}
+
+static ssize_t multicast_igmp_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version);
+}
+static DEVICE_ATTR_RW(multicast_igmp_version);
+
static ssize_t multicast_last_member_count_show(struct device *d,
struct device_attribute *attr,
char *buf)
@@ -642,6 +659,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d,
return store_bridge_parm(d, buf, len, set_stats_enabled);
}
static DEVICE_ATTR_RW(multicast_stats_enabled);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static ssize_t multicast_mld_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_mld_version);
+}
+
+static ssize_t multicast_mld_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_mld_version);
+}
+static DEVICE_ATTR_RW(multicast_mld_version);
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static ssize_t nf_call_iptables_show(
@@ -809,6 +845,10 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_query_response_interval.attr,
&dev_attr_multicast_startup_query_interval.attr,
&dev_attr_multicast_stats_enabled.attr,
+ &dev_attr_multicast_igmp_version.attr,
+#if IS_ENABLED(CONFIG_IPV6)
+ &dev_attr_multicast_mld_version.attr,
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
&dev_attr_nf_call_iptables.attr,
@@ -898,6 +938,7 @@ int br_sysfs_addbr(struct net_device *dev)
if (!br->ifobj) {
pr_info("%s: can't add kobject (directory) %s/%s\n",
__func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR);
+ err = -ENOMEM;
goto out3;
}
return 0;
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 1e04d4d44273..8bd569695e76 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(learning, BR_LEARNING);
BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
+BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -216,6 +217,7 @@ static const struct brport_attribute *brport_attrs[] = {
#endif
&brport_attr_proxyarp,
&brport_attr_proxyarp_wifi,
+ &brport_attr_multicast_flood,
NULL
};
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 9cebf47ac840..e7ef1a1ef3a6 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -22,6 +22,7 @@ config NFT_BRIDGE_REJECT
config NF_LOG_BRIDGE
tristate "Bridge packet logging"
+ select NF_LOG_COMMON
endif # NF_TABLES_BRIDGE
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 070cf134a22f..5929309beaa1 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -51,7 +51,8 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (diptr == NULL)
return EBT_DROP;
- arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
+ arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr,
+ (struct net_device *)xt_in(par),
*diptr, shp, info->mac, shp);
return info->target;
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 152300d164ac..e88bd4827ac1 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
if (loginfo->type == NF_LOG_TYPE_LOG)
bitmask = loginfo->u.log.logflags;
else
- bitmask = NF_LOG_MASK;
+ bitmask = NF_LOG_DEFAULT_MASK;
if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
htons(ETH_P_IP)) {
@@ -179,7 +179,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_log_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = info->loglevel;
@@ -190,11 +190,12 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
* nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
*/
if (info->bitmask & EBT_LOG_NFLOG)
- nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
- par->in, par->out, &li, "%s", info->prefix);
+ nf_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, "%s",
+ info->prefix);
else
- ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, info->prefix);
+ ebt_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, info->prefix);
return EBT_CONTINUE;
}
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 54816150608e..c1dc48686200 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -23,16 +23,16 @@ static unsigned int
ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nflog_info *info = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
- nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, "%s", info->prefix);
+ nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", info->prefix);
return EBT_CONTINUE;
}
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 203964997a51..8d2a85e0594e 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -23,12 +23,12 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (!skb_make_writable(skb, 0))
return EBT_DROP;
- if (par->hooknum != NF_BR_BROUTING)
- /* rcu_read_lock()ed by nf_hook_slow */
+ if (xt_hooknum(par) != NF_BR_BROUTING)
+ /* rcu_read_lock()ed by nf_hook_thresh */
ether_addr_copy(eth_hdr(skb)->h_dest,
- br_port_get_rcu(par->in)->br->dev->dev_addr);
+ br_port_get_rcu(xt_in(par))->br->dev->dev_addr);
else
- ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr);
+ ether_addr_copy(eth_hdr(skb)->h_dest, xt_in(par)->dev_addr);
skb->pkt_type = PACKET_HOST;
return info->target;
}
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index ec94c6f1ae88..8fe36dc3aab2 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -53,7 +53,7 @@ static int ebt_broute(struct sk_buff *skb)
struct nf_hook_state state;
int ret;
- nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN,
+ nf_hook_state_init(&state, NF_BR_BROUTING,
NFPROTO_BRIDGE, skb->dev, NULL, NULL,
dev_net(skb->dev), NULL);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0833c251aef7..537e3d506fc2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -23,7 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/audit.h>
@@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
return 1;
if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
return 1;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
if (in && (p = br_port_get_rcu(in)) != NULL &&
NF_INVF(e, EBT_ILOGICALIN,
ebt_dev_check(e->logical_in, p->br->dev)))
@@ -194,12 +194,8 @@ unsigned int ebt_do_table(struct sk_buff *skb,
const struct ebt_table_info *private;
struct xt_action_param acpar;
- acpar.family = NFPROTO_BRIDGE;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
+ acpar.state = state;
acpar.hotdrop = false;
- acpar.hooknum = hook;
read_lock_bh(&table->lock);
private = table->private;
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index 5d9953a90929..bd2b3c78f59b 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -24,21 +24,8 @@ static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- switch (eth_hdr(skb)->h_proto) {
- case htons(ETH_P_IP):
- nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_IPV6):
- nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_ARP):
- case htons(ETH_P_RARP):
- nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- }
+ nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb,
+ in, out, loginfo, prefix);
}
static struct nf_logger nf_bridge_logger __read_mostly = {
@@ -50,8 +37,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = {
static int __net_init nf_log_bridge_net_init(struct net *net)
{
- nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
- return 0;
+ return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
}
static void __net_exit nf_log_bridge_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index a78c4e2826e5..97afdc0744e6 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -13,79 +13,11 @@
#include <linux/module.h>
#include <linux/netfilter_bridge.h>
#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_bridge.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>
-int nft_bridge_iphdr_validate(struct sk_buff *skb)
-{
- struct iphdr *iph;
- u32 len;
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return 0;
-
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
- return 0;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len)
- return 0;
- else if (len < (iph->ihl*4))
- return 0;
-
- if (!pskb_may_pull(skb, iph->ihl*4))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate);
-
-int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
-{
- struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return 0;
-
- hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
- return 0;
-
- pkt_len = ntohs(hdr->payload_len);
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
-
-static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (nft_bridge_iphdr_validate(skb))
- nft_set_pktinfo_ipv4(pkt, skb, state);
- else
- nft_set_pktinfo(pkt, skb, state);
-}
-
-static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
- if (nft_bridge_ip6hdr_validate(skb) &&
- nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
- return;
-#endif
- nft_set_pktinfo(pkt, skb, state);
-}
-
static unsigned int
nft_do_chain_bridge(void *priv,
struct sk_buff *skb,
@@ -95,13 +27,13 @@ nft_do_chain_bridge(void *priv,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_bridge_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
break;
case htons(ETH_P_IPV6):
- nft_bridge_set_pktinfo_ipv6(&pkt, skb, state);
+ nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
break;
default:
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
break;
}
@@ -207,12 +139,20 @@ static int __init nf_tables_bridge_init(void)
int ret;
nf_register_afinfo(&nf_br_afinfo);
- nft_register_chain_type(&filter_bridge);
+ ret = nft_register_chain_type(&filter_bridge);
+ if (ret < 0)
+ goto err1;
+
ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
- if (ret < 0) {
- nft_unregister_chain_type(&filter_bridge);
- nf_unregister_afinfo(&nf_br_afinfo);
- }
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ nft_unregister_chain_type(&filter_bridge);
+err1:
+ nf_unregister_afinfo(&nf_br_afinfo);
return ret;
}
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index ad47a921b701..5974dbc1ea24 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -23,7 +23,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
- const struct net_device *in = pkt->in, *out = pkt->out;
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
u32 *dest = &regs->data[priv->dreg];
const struct net_bridge_port *p;
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 0b77ffbc27d6..206dc266ecd2 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -14,7 +14,6 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_reject.h>
-#include <net/netfilter/nf_tables_bridge.h>
#include <net/netfilter/ipv4/nf_reject.h>
#include <net/netfilter/ipv6/nf_reject.h>
#include <linux/ip.h>
@@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
skb_pull(nskb, ETH_HLEN);
}
+static int nft_bridge_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+
/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
* or the bridge port (NF_BRIDGE PREROUTING).
*/
@@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
+static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
static void nft_reject_br_send_v6_tcp_reset(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
@@ -273,17 +315,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IP):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb,
- pkt->in, pkt->hook);
+ nft_reject_br_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
nft_reject_icmp_code(priv->icmp_code));
break;
}
@@ -291,17 +336,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IPV6):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb,
- pkt->in, pkt->hook);
+ nft_reject_br_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
nft_reject_icmpv6_code(priv->icmp_code));
break;
}
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index d730a0f68f46..2d38b6e34203 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -52,7 +52,7 @@ struct caif_net {
struct caif_device_entry_list caifdevs;
};
-static int caif_net_id;
+static unsigned int caif_net_id;
static int q_high = 50; /* Percent */
struct cfcnfg *get_cfcnfg(struct net *net)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index aa209b1066c9..92cbbd2afddb 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1107,10 +1107,7 @@ static struct net_proto_family caif_family_ops = {
static int __init caif_sktinit_module(void)
{
- int err = sock_register(&caif_family_ops);
- if (!err)
- return err;
- return 0;
+ return sock_register(&caif_family_ops);
}
static void __exit caif_sktexit_module(void)
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index fa39fc298708..273cb07f57d8 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -390,8 +390,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
rcu_read_lock();
if (adapt_layer == NULL) {
- pr_debug("link setup response but no client exist,"
- "send linkdown back\n");
+ pr_debug("link setup response but no client exist, send linkdown back\n");
cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
goto unlock;
}
@@ -401,8 +400,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
if (phyinfo == NULL) {
- pr_err("ERROR: Link Layer Device disappeared"
- "while connecting\n");
+ pr_err("ERROR: Link Layer Device disappeared while connecting\n");
goto unlock;
}
@@ -436,8 +434,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
break;
default:
- pr_err("Protocol error. Link setup response "
- "- unknown channel type\n");
+ pr_err("Protocol error. Link setup response - unknown channel type\n");
goto unlock;
}
if (!servicel)
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1108079d934f..5488e4a6ccd0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
* @func: callback function on filter match
* @data: returned parameter for callback function
* @ident: string for calling module identification
+ * @sk: socket pointer (might be NULL)
*
* Description:
* Invokes the callback function with the received sk_buff and the given
@@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
*/
int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
void (*func)(struct sk_buff *, void *), void *data,
- char *ident)
+ char *ident, struct sock *sk)
{
struct receiver *r;
struct hlist_head *rl;
@@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
r->func = func;
r->data = data;
r->ident = ident;
+ r->sk = sk;
hlist_add_head_rcu(&r->list, rl);
d->entries++;
@@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register);
static void can_rx_delete_receiver(struct rcu_head *rp)
{
struct receiver *r = container_of(rp, struct receiver, rcu);
+ struct sock *sk = r->sk;
kmem_cache_free(rcv_cache, r);
+ if (sk)
+ sock_put(sk);
}
/**
@@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
spin_unlock(&can_rcvlists_lock);
/* schedule the receiver item for deletion */
- if (r)
+ if (r) {
+ if (r->sk)
+ sock_hold(r->sk);
call_rcu(&r->rcu, can_rx_delete_receiver);
+ }
}
EXPORT_SYMBOL(can_rx_unregister);
diff --git a/net/can/af_can.h b/net/can/af_can.h
index fca0fe9fc45a..b86f5129e838 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -50,13 +50,14 @@
struct receiver {
struct hlist_node list;
- struct rcu_head rcu;
canid_t can_id;
canid_t mask;
unsigned long matches;
void (*func)(struct sk_buff *, void *);
void *data;
char *ident;
+ struct sock *sk;
+ struct rcu_head rcu;
};
#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 8e999ffdf28b..95d13b233c65 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -77,7 +77,7 @@
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION "20160617"
+#define CAN_BCM_VERSION "20161123"
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
@@ -109,8 +109,9 @@ struct bcm_op {
u32 count;
u32 nframes;
u32 currframe;
- struct canfd_frame *frames;
- struct canfd_frame *last_frames;
+ /* void pointers to arrays of struct can[fd]_frame */
+ void *frames;
+ void *last_frames;
struct canfd_frame sframe;
struct canfd_frame last_sframe;
struct sock *sk;
@@ -198,11 +199,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
seq_printf(m, "timeo=%lld ",
(long long)ktime_to_us(op->kt_ival1));
- if (op->kt_ival2.tv64)
+ if (op->kt_ival2)
seq_printf(m, "thr=%lld ",
(long long)ktime_to_us(op->kt_ival2));
@@ -225,11 +226,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
else
seq_printf(m, "[%u] ", op->nframes);
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
seq_printf(m, "t1=%lld ",
(long long)ktime_to_us(op->kt_ival1));
- if (op->kt_ival2.tv64)
+ if (op->kt_ival2)
seq_printf(m, "t2=%lld ",
(long long)ktime_to_us(op->kt_ival2));
@@ -364,11 +365,11 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
static void bcm_tx_start_timer(struct bcm_op *op)
{
- if (op->kt_ival1.tv64 && op->count)
+ if (op->kt_ival1 && op->count)
hrtimer_start(&op->timer,
ktime_add(ktime_get(), op->kt_ival1),
HRTIMER_MODE_ABS);
- else if (op->kt_ival2.tv64)
+ else if (op->kt_ival2)
hrtimer_start(&op->timer,
ktime_add(ktime_get(), op->kt_ival2),
HRTIMER_MODE_ABS);
@@ -379,7 +380,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
struct bcm_op *op = (struct bcm_op *)data;
struct bcm_msg_head msg_head;
- if (op->kt_ival1.tv64 && (op->count > 0)) {
+ if (op->kt_ival1 && (op->count > 0)) {
op->count--;
if (!op->count && (op->flags & TX_COUNTEVT)) {
@@ -397,7 +398,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
}
bcm_can_tx(op);
- } else if (op->kt_ival2.tv64)
+ } else if (op->kt_ival2)
bcm_can_tx(op);
bcm_tx_start_timer(op);
@@ -458,7 +459,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
lastdata->flags |= (RX_RECV|RX_THR);
/* throttling mode inactive ? */
- if (!op->kt_ival2.tv64) {
+ if (!op->kt_ival2) {
/* send RX_CHANGED to the user immediately */
bcm_rx_changed(op, lastdata);
return;
@@ -469,7 +470,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
return;
/* first reception with enabled throttling mode */
- if (!op->kt_lastmsg.tv64)
+ if (!op->kt_lastmsg)
goto rx_changed_settime;
/* got a second frame inside a potential throttle period? */
@@ -536,7 +537,7 @@ static void bcm_rx_starttimer(struct bcm_op *op)
if (op->flags & RX_NO_AUTOTIMER)
return;
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
}
@@ -642,7 +643,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
} else {
/* rearm throttle handling */
- op->kt_lastmsg = ktime_set(0, 0);
+ op->kt_lastmsg = 0;
return HRTIMER_NORESTART;
}
}
@@ -681,7 +682,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
if (op->flags & RX_FILTER_ID) {
/* the easiest case */
- bcm_rx_update_and_send(op, &op->last_frames[0], rxframe);
+ bcm_rx_update_and_send(op, op->last_frames, rxframe);
goto rx_starttimer;
}
@@ -733,14 +734,23 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
static void bcm_remove_op(struct bcm_op *op)
{
- hrtimer_cancel(&op->timer);
- hrtimer_cancel(&op->thrtimer);
-
- if (op->tsklet.func)
- tasklet_kill(&op->tsklet);
+ if (op->tsklet.func) {
+ while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
+ test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
+ hrtimer_active(&op->timer)) {
+ hrtimer_cancel(&op->timer);
+ tasklet_kill(&op->tsklet);
+ }
+ }
- if (op->thrtsklet.func)
- tasklet_kill(&op->thrtsklet);
+ if (op->thrtsklet.func) {
+ while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
+ test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
+ hrtimer_active(&op->thrtimer)) {
+ hrtimer_cancel(&op->thrtimer);
+ tasklet_kill(&op->thrtsklet);
+ }
+ }
if ((op->frames) && (op->frames != &op->sframe))
kfree(op->frames);
@@ -1004,7 +1014,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero values? */
- if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64)
+ if (!op->kt_ival1 && !op->kt_ival2)
hrtimer_cancel(&op->timer);
}
@@ -1068,7 +1078,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->nframes) {
/* update CAN frames content */
- err = memcpy_from_msg((u8 *)op->frames, msg,
+ err = memcpy_from_msg(op->frames, msg,
msg_head->nframes * op->cfsiz);
if (err < 0)
return err;
@@ -1118,7 +1128,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
if (msg_head->nframes) {
- err = memcpy_from_msg((u8 *)op->frames, msg,
+ err = memcpy_from_msg(op->frames, msg,
msg_head->nframes * op->cfsiz);
if (err < 0) {
if (op->frames != &op->sframe)
@@ -1163,6 +1173,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
/* check flags */
if (op->flags & RX_RTR_FRAME) {
+ struct canfd_frame *frame0 = op->frames;
/* no timers in RTR-mode */
hrtimer_cancel(&op->thrtimer);
@@ -1174,8 +1185,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
* prevent a full-load-loopback-test ... ;-]
*/
if ((op->flags & TX_CP_CAN_ID) ||
- (op->frames[0].can_id == op->can_id))
- op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG;
+ (frame0->can_id == op->can_id))
+ frame0->can_id = op->can_id & ~CAN_RTR_FLAG;
} else {
if (op->flags & SETTIMER) {
@@ -1187,19 +1198,19 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero value? */
- if (!op->kt_ival1.tv64)
+ if (!op->kt_ival1)
hrtimer_cancel(&op->timer);
/*
* In any case cancel the throttle timer, flush
* potentially blocked msgs and reset throttle handling
*/
- op->kt_lastmsg = ktime_set(0, 0);
+ op->kt_lastmsg = 0;
hrtimer_cancel(&op->thrtimer);
bcm_rx_thr_flush(op, 1);
}
- if ((op->flags & STARTTIMER) && op->kt_ival1.tv64)
+ if ((op->flags & STARTTIMER) && op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1,
HRTIMER_MODE_REL);
}
@@ -1214,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
err = can_rx_register(dev, op->can_id,
REGMASK(op->can_id),
bcm_rx_handler, op,
- "bcm");
+ "bcm", sk);
op->rx_reg_dev = dev;
dev_put(dev);
@@ -1223,7 +1234,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
} else
err = can_rx_register(NULL, op->can_id,
REGMASK(op->can_id),
- bcm_rx_handler, op, "bcm");
+ bcm_rx_handler, op, "bcm", sk);
if (err) {
/* this bcm rx op is broken -> remove it */
list_del(&op->list);
@@ -1549,24 +1560,31 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
struct sock *sk = sock->sk;
struct bcm_sock *bo = bcm_sk(sk);
+ int ret = 0;
if (len < sizeof(*addr))
return -EINVAL;
- if (bo->bound)
- return -EISCONN;
+ lock_sock(sk);
+
+ if (bo->bound) {
+ ret = -EISCONN;
+ goto fail;
+ }
/* bind a device to this socket */
if (addr->can_ifindex) {
struct net_device *dev;
dev = dev_get_by_index(&init_net, addr->can_ifindex);
- if (!dev)
- return -ENODEV;
-
+ if (!dev) {
+ ret = -ENODEV;
+ goto fail;
+ }
if (dev->type != ARPHRD_CAN) {
dev_put(dev);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
bo->ifindex = dev->ifindex;
@@ -1577,17 +1595,24 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
bo->ifindex = 0;
}
- bo->bound = 1;
-
if (proc_dir) {
/* unique socket address as filename */
sprintf(bo->procname, "%lu", sock_i_ino(sk));
bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
proc_dir,
&bcm_proc_fops, sk);
+ if (!bo->bcm_proc_read) {
+ ret = -ENOMEM;
+ goto fail;
+ }
}
- return 0;
+ bo->bound = 1;
+
+fail:
+ release_sock(sk);
+
+ return ret;
}
static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/can/gw.c b/net/can/gw.c
index 455168718c2e..7056a1a2bb70 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -429,7 +429,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
/* clear the skb timestamp if not configured the other way */
if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
- nskb->tstamp.tv64 = 0;
+ nskb->tstamp = 0;
/* send to netdevice */
if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
@@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj)
{
return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id,
gwj->ccgw.filter.can_mask, can_can_gw_rcv,
- gwj, "gw");
+ gwj, "gw", NULL);
}
static inline void cgw_unregister_filter(struct cgw_job *gwj)
diff --git a/net/can/raw.c b/net/can/raw.c
index 972c187d40ab..6dc546a06673 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk,
for (i = 0; i < count; i++) {
err = can_rx_register(dev, filter[i].can_id,
filter[i].can_mask,
- raw_rcv, sk, "raw");
+ raw_rcv, sk, "raw", sk);
if (err) {
/* clean up successfully registered filters */
while (--i >= 0)
@@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
if (err_mask)
err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
- raw_rcv, sk, "raw");
+ raw_rcv, sk, "raw", sk);
return err;
}
@@ -499,6 +499,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
if (optlen % sizeof(struct can_filter) != 0)
return -EINVAL;
+ if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter))
+ return -EINVAL;
+
count = optlen / sizeof(struct can_filter);
if (count > 1) {
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 84cbed630c4b..6a5180903e7b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_CEPH_LIB) += libceph.o
libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
mon_client.o \
+ cls_lock_client.o \
osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
debugfs.o \
auth.o auth_none.o \
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 2bc5965fdd1e..48bb8d95195b 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -82,7 +82,10 @@ void ceph_auth_reset(struct ceph_auth_client *ac)
mutex_unlock(&ac->mutex);
}
-int ceph_entity_name_encode(const char *name, void **p, void *end)
+/*
+ * EntityName, not to be confused with entity_name_t
+ */
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end)
{
int len = strlen(name);
@@ -111,7 +114,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
monhdr->session_mon = cpu_to_le16(-1);
monhdr->session_mon_tid = 0;
- ceph_encode_32(&p, 0); /* no protocol, yet */
+ ceph_encode_32(&p, CEPH_AUTH_UNKNOWN); /* no protocol, yet */
lenp = p;
p += sizeof(u32);
@@ -124,7 +127,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
for (i = 0; i < num; i++)
ceph_encode_32(&p, supported_protocols[i]);
- ret = ceph_entity_name_encode(ac->name, &p, end);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
if (ret < 0)
goto out;
ceph_decode_need(&p, end, sizeof(u64), bad);
@@ -259,9 +262,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
int ret = 0;
mutex_lock(&ac->mutex);
- if (!ac->protocol)
- ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
- else if (ac->ops->should_authenticate(ac))
+ if (ac->ops->should_authenticate(ac))
ret = ceph_build_auth_request(ac, msg_buf, msg_len);
mutex_unlock(&ac->mutex);
return ret;
@@ -314,13 +315,13 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
EXPORT_SYMBOL(ceph_auth_update_authorizer);
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len)
+ struct ceph_authorizer *a)
{
int ret = 0;
mutex_lock(&ac->mutex);
if (ac->ops && ac->ops->verify_authorizer_reply)
- ret = ac->ops->verify_authorizer_reply(ac, a, len);
+ ret = ac->ops->verify_authorizer_reply(ac, a);
mutex_unlock(&ac->mutex);
return ret;
}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 5f836f02ae36..df45e467c81f 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -46,7 +46,7 @@ static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
int ret;
ceph_encode_8_safe(&p, end, 1, e_range);
- ret = ceph_entity_name_encode(ac->name, &p, end);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
if (ret < 0)
return ret;
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a0905f04bd13..2034fb926670 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -39,56 +39,58 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
return need != 0;
}
+static int ceph_x_encrypt_offset(void)
+{
+ return sizeof(u32) + sizeof(struct ceph_x_encrypt_header);
+}
+
static int ceph_x_encrypt_buflen(int ilen)
{
- return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
- sizeof(u32);
+ return ceph_x_encrypt_offset() + ilen + 16;
}
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
- void *ibuf, int ilen, void *obuf, size_t olen)
+static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
+ int buf_len, int plaintext_len)
{
- struct ceph_x_encrypt_header head = {
- .struct_v = 1,
- .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
- };
- size_t len = olen - sizeof(u32);
+ struct ceph_x_encrypt_header *hdr = buf + sizeof(u32);
+ int ciphertext_len;
int ret;
- ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
- &head, sizeof(head), ibuf, ilen);
+ hdr->struct_v = 1;
+ hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC);
+
+ ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32),
+ plaintext_len + sizeof(struct ceph_x_encrypt_header),
+ &ciphertext_len);
if (ret)
return ret;
- ceph_encode_32(&obuf, len);
- return len + sizeof(u32);
+
+ ceph_encode_32(&buf, ciphertext_len);
+ return sizeof(u32) + ciphertext_len;
}
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
- void **p, void *end, void **obuf, size_t olen)
+static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
{
- struct ceph_x_encrypt_header head;
- size_t head_len = sizeof(head);
- int len, ret;
-
- len = ceph_decode_32(p);
- if (*p + len > end)
- return -EINVAL;
+ struct ceph_x_encrypt_header *hdr = *p + sizeof(u32);
+ int ciphertext_len, plaintext_len;
+ int ret;
- dout("ceph_x_decrypt len %d\n", len);
- if (*obuf == NULL) {
- *obuf = kmalloc(len, GFP_NOFS);
- if (!*obuf)
- return -ENOMEM;
- olen = len;
- }
+ ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
+ ceph_decode_need(p, end, ciphertext_len, e_inval);
- ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len);
+ ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len,
+ &plaintext_len);
if (ret)
return ret;
- if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+
+ if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC)
return -EPERM;
- *p += len;
- return olen;
+
+ *p += ciphertext_len;
+ return plaintext_len - sizeof(struct ceph_x_encrypt_header);
+
+e_inval:
+ return -EINVAL;
}
/*
@@ -143,13 +145,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
int type;
u8 tkt_struct_v, blob_struct_v;
struct ceph_x_ticket_handler *th;
- void *dbuf = NULL;
void *dp, *dend;
int dlen;
char is_enc;
struct timespec validity;
- struct ceph_crypto_key old_key;
- void *ticket_buf = NULL;
void *tp, *tpend;
void **ptp;
struct ceph_crypto_key new_session_key;
@@ -174,20 +173,17 @@ static int process_one_ticket(struct ceph_auth_client *ac,
}
/* blob for me */
- dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0);
- if (dlen <= 0) {
- ret = dlen;
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
goto out;
- }
- dout(" decrypted %d bytes\n", dlen);
- dp = dbuf;
- dend = dp + dlen;
+ dout(" decrypted %d bytes\n", ret);
+ dend = dp + ret;
tkt_struct_v = ceph_decode_8(&dp);
if (tkt_struct_v != 1)
goto bad;
- memcpy(&old_key, &th->session_key, sizeof(old_key));
ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
if (ret)
goto out;
@@ -203,15 +199,13 @@ static int process_one_ticket(struct ceph_auth_client *ac,
ceph_decode_8_safe(p, end, is_enc, bad);
if (is_enc) {
/* encrypted */
- dout(" encrypted ticket\n");
- dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0);
- if (dlen < 0) {
- ret = dlen;
+ tp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, end);
+ if (ret < 0)
goto out;
- }
- tp = ticket_buf;
+ dout(" encrypted ticket, decrypted %d bytes\n", ret);
ptp = &tp;
- tpend = *ptp + dlen;
+ tpend = tp + ret;
} else {
/* unencrypted */
ptp = p;
@@ -242,8 +236,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
xi->have_keys |= th->service;
out:
- kfree(ticket_buf);
- kfree(dbuf);
return ret;
bad:
@@ -294,7 +286,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
{
int maxlen;
struct ceph_x_authorize_a *msg_a;
- struct ceph_x_authorize_b msg_b;
+ struct ceph_x_authorize_b *msg_b;
void *p, *end;
int ret;
int ticket_blob_len =
@@ -308,8 +300,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
if (ret)
goto out_au;
- maxlen = sizeof(*msg_a) + sizeof(msg_b) +
- ceph_x_encrypt_buflen(ticket_blob_len);
+ maxlen = sizeof(*msg_a) + ticket_blob_len +
+ ceph_x_encrypt_buflen(sizeof(*msg_b));
dout(" need len %d\n", maxlen);
if (au->buf && au->buf->alloc_len < maxlen) {
ceph_buffer_put(au->buf);
@@ -343,18 +335,19 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
p += ticket_blob_len;
end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+ msg_b = p + ceph_x_encrypt_offset();
+ msg_b->struct_v = 1;
get_random_bytes(&au->nonce, sizeof(au->nonce));
- msg_b.struct_v = 1;
- msg_b.nonce = cpu_to_le64(au->nonce);
- ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
- p, end - p);
+ msg_b->nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
if (ret < 0)
goto out_au;
+
p += ret;
+ WARN_ON(p > end);
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
(int)au->buf->vec.iov_len);
- BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;
out_au:
@@ -452,8 +445,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
void *p = auth + 1;
- struct ceph_x_challenge_blob tmp;
- char tmp_enc[40];
+ void *enc_buf = xi->auth_authorizer.enc_buf;
+ struct ceph_x_challenge_blob *blob = enc_buf +
+ ceph_x_encrypt_offset();
u64 *u;
if (p > end)
@@ -464,16 +458,16 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
/* encrypt and hash */
get_random_bytes(&auth->client_challenge, sizeof(u64));
- tmp.client_challenge = auth->client_challenge;
- tmp.server_challenge = cpu_to_le64(xi->server_challenge);
- ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
- tmp_enc, sizeof(tmp_enc));
+ blob->client_challenge = auth->client_challenge;
+ blob->server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN,
+ sizeof(*blob));
if (ret < 0)
return ret;
auth->struct_v = 1;
auth->key = 0;
- for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
auth->key ^= *(__le64 *)u;
dout(" server_challenge %llx client_challenge %llx key %llx\n",
xi->server_challenge, le64_to_cpu(auth->client_challenge),
@@ -600,8 +594,8 @@ static int ceph_x_create_authorizer(
auth->authorizer = (struct ceph_authorizer *) au;
auth->authorizer_buf = au->buf->vec.iov_base;
auth->authorizer_buf_len = au->buf->vec.iov_len;
- auth->authorizer_reply_buf = au->reply_buf;
- auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+ auth->authorizer_reply_buf = au->enc_buf;
+ auth->authorizer_reply_buf_len = CEPHX_AU_ENC_BUF_LEN;
auth->sign_message = ac->ops->sign_message;
auth->check_message_signature = ac->ops->check_message_signature;
@@ -629,27 +623,25 @@ static int ceph_x_update_authorizer(
}
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len)
+ struct ceph_authorizer *a)
{
struct ceph_x_authorizer *au = (void *)a;
- int ret = 0;
- struct ceph_x_authorize_reply reply;
- void *preply = &reply;
- void *p = au->reply_buf;
- void *end = p + sizeof(au->reply_buf);
+ void *p = au->enc_buf;
+ struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+ int ret;
- ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply));
+ ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
if (ret < 0)
return ret;
- if (ret != sizeof(reply))
+ if (ret != sizeof(*reply))
return -EPERM;
- if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
ret = -EPERM;
else
ret = 0;
dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+ au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
return ret;
}
@@ -704,35 +696,48 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
}
-static int calcu_signature(struct ceph_x_authorizer *au,
- struct ceph_msg *msg, __le64 *sig)
+static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
+ __le64 *psig)
{
+ void *enc_buf = au->enc_buf;
+ struct {
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
int ret;
- char tmp_enc[40];
- __le32 tmp[5] = {
- cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc,
- msg->footer.middle_crc, msg->footer.data_crc,
- };
- ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp),
- tmp_enc, sizeof(tmp_enc));
+
+ sigblock->len = cpu_to_le32(4*sizeof(u32));
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->data_crc = msg->footer.data_crc;
+ ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN,
+ sizeof(*sigblock));
if (ret < 0)
return ret;
- *sig = *(__le64*)(tmp_enc + 4);
+
+ *psig = *(__le64 *)(enc_buf + sizeof(u32));
return 0;
}
static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
struct ceph_msg *msg)
{
+ __le64 sig;
int ret;
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
- ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
- msg, &msg->footer.sig);
- if (ret < 0)
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig);
+ if (ret)
return ret;
+
+ msg->footer.sig = sig;
msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
return 0;
}
@@ -746,9 +751,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
- ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
- msg, &sig_check);
- if (ret < 0)
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig_check);
+ if (ret)
return ret;
if (sig_check == msg->footer.sig)
return 0;
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 21a5af904bae..48e9ad41bd2a 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -24,6 +24,7 @@ struct ceph_x_ticket_handler {
unsigned long renew_after, expires;
};
+#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
struct ceph_x_authorizer {
struct ceph_authorizer base;
@@ -32,7 +33,7 @@ struct ceph_x_authorizer {
unsigned int service;
u64 nonce;
u64 secret_id;
- char reply_buf[128]; /* big enough for encrypted blob */
+ char enc_buf[CEPHX_AU_ENC_BUF_LEN] __aligned(8);
};
struct ceph_x_info {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index bddfcf6f09c2..464e88599b9d 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -566,11 +566,17 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
}
EXPORT_SYMBOL(ceph_print_client_options);
-u64 ceph_client_id(struct ceph_client *client)
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client)
+{
+ return &client->msgr.inst.addr;
+}
+EXPORT_SYMBOL(ceph_client_addr);
+
+u64 ceph_client_gid(struct ceph_client *client)
{
return client->monc.auth->global_id;
}
-EXPORT_SYMBOL(ceph_client_id);
+EXPORT_SYMBOL(ceph_client_gid);
/*
* create a fresh client instance
@@ -685,7 +691,8 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return client->auth_err;
}
- pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid);
+ pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
+ &client->fsid);
ceph_debugfs_client_init(client);
return 0;
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 7d54e944de5e..dcbe67ff3e2b 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -34,7 +34,8 @@ void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
fl->object_size = le32_to_cpu(legacy->fl_object_size);
fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
- if (fl->pool_id == 0)
+ if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
+ fl->stripe_count == 0 && fl->object_size == 0)
fl->pool_id = -1;
}
EXPORT_SYMBOL(ceph_file_layout_from_legacy);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 3773a4fa11e3..19b7d8aa915c 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -15,6 +15,7 @@ const char *ceph_entity_type_name(int type)
default: return "unknown";
}
}
+EXPORT_SYMBOL(ceph_entity_type_name);
const char *ceph_osd_op_name(int op)
{
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
new file mode 100644
index 000000000000..50f040fdb2a9
--- /dev/null
+++ b/net/ceph/cls_lock_client.c
@@ -0,0 +1,325 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/cls_lock_client.h>
+#include <linux/ceph/decode.h>
+
+/**
+ * ceph_cls_lock - grab rados lock for object
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED)
+ * @cookie: user-defined identifier for this instance of the lock
+ * @tag: user-defined tag
+ * @desc: user-defined lock description
+ * @flags: lock flags
+ *
+ * All operations on the same lock should use the same tag.
+ */
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *cookie,
+ char *tag, char *desc, u8 flags)
+{
+ int lock_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ int tag_len = strlen(tag);
+ int desc_len = strlen(desc);
+ void *p, *end;
+ struct page *lock_op_page;
+ struct timespec mtime;
+ int ret;
+
+ lock_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ desc_len + sizeof(__le32) +
+ sizeof(struct ceph_timespec) +
+ /* flag and type */
+ sizeof(u8) + sizeof(u8) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (lock_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ lock_op_page = alloc_page(GFP_NOIO);
+ if (!lock_op_page)
+ return -ENOMEM;
+
+ p = page_address(lock_op_page);
+ end = p + lock_op_buf_size;
+
+ /* encode cls_lock_lock_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ lock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, desc, desc_len);
+ /* only support infinite duration */
+ memset(&mtime, 0, sizeof(mtime));
+ ceph_encode_timespec(p, &mtime);
+ p += sizeof(struct ceph_timespec);
+ ceph_encode_8(&p, flags);
+
+ dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
+ __func__, lock_name, type, cookie, tag, desc, flags);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ lock_op_page, lock_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(lock_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock);
+
+/**
+ * ceph_cls_unlock - release rados lock for object
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ */
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie)
+{
+ int unlock_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ void *p, *end;
+ struct page *unlock_op_page;
+ int ret;
+
+ unlock_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (unlock_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ unlock_op_page = alloc_page(GFP_NOIO);
+ if (!unlock_op_page)
+ return -ENOMEM;
+
+ p = page_address(unlock_op_page);
+ end = p + unlock_op_buf_size;
+
+ /* encode cls_lock_unlock_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ unlock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+
+ dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ unlock_op_page, unlock_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(unlock_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_unlock);
+
+/**
+ * ceph_cls_break_lock - release rados lock for object for specified client
+ * @oid, @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ * @locker: current lock owner
+ */
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie,
+ struct ceph_entity_name *locker)
+{
+ int break_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ struct page *break_op_page;
+ void *p, *end;
+ int ret;
+
+ break_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ sizeof(u8) + sizeof(__le64) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (break_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ break_op_page = alloc_page(GFP_NOIO);
+ if (!break_op_page)
+ return -ENOMEM;
+
+ p = page_address(break_op_page);
+ end = p + break_op_buf_size;
+
+ /* encode cls_lock_break_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ break_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_copy(&p, locker, sizeof(*locker));
+ ceph_encode_string(&p, end, cookie, cookie_len);
+
+ dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
+ cookie, ENTITY_NAME(*locker));
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ break_op_page, break_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(break_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_break_lock);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
+{
+ int i;
+
+ for (i = 0; i < num_lockers; i++)
+ kfree(lockers[i].id.cookie);
+ kfree(lockers);
+}
+EXPORT_SYMBOL(ceph_free_lockers);
+
+static int decode_locker(void **p, void *end, struct ceph_locker *locker)
+{
+ u8 struct_v;
+ u32 len;
+ char *s;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "locker_id_t", &struct_v, &len);
+ if (ret)
+ return ret;
+
+ ceph_decode_copy(p, &locker->id.name, sizeof(locker->id.name));
+ s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ locker->id.cookie = s;
+
+ ret = ceph_start_decoding(p, end, 1, "locker_info_t", &struct_v, &len);
+ if (ret)
+ return ret;
+
+ *p += sizeof(struct ceph_timespec); /* skip expiration */
+ ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
+ ceph_decode_addr(&locker->info.addr);
+ len = ceph_decode_32(p);
+ *p += len; /* skip description */
+
+ dout("%s %s%llu cookie %s addr %s\n", __func__,
+ ENTITY_NAME(locker->id.name), locker->id.cookie,
+ ceph_pr_addr(&locker->info.addr.in_addr));
+ return 0;
+}
+
+static int decode_lockers(void **p, void *end, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers)
+{
+ u8 struct_v;
+ u32 struct_len;
+ char *s;
+ int i;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "cls_lock_get_info_reply",
+ &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ *num_lockers = ceph_decode_32(p);
+ *lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO);
+ if (!*lockers)
+ return -ENOMEM;
+
+ for (i = 0; i < *num_lockers; i++) {
+ ret = decode_locker(p, end, *lockers + i);
+ if (ret)
+ goto err_free_lockers;
+ }
+
+ *type = ceph_decode_8(p);
+ s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+ if (IS_ERR(s)) {
+ ret = PTR_ERR(s);
+ goto err_free_lockers;
+ }
+
+ *tag = s;
+ return 0;
+
+err_free_lockers:
+ ceph_free_lockers(*lockers, *num_lockers);
+ return ret;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ * kfree(tag);
+ * ceph_free_lockers(lockers, num_lockers);
+ */
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers)
+{
+ int get_info_op_buf_size;
+ int name_len = strlen(lock_name);
+ struct page *get_info_op_page, *reply_page;
+ size_t reply_len;
+ void *p, *end;
+ int ret;
+
+ get_info_op_buf_size = name_len + sizeof(__le32) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (get_info_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ get_info_op_page = alloc_page(GFP_NOIO);
+ if (!get_info_op_page)
+ return -ENOMEM;
+
+ reply_page = alloc_page(GFP_NOIO);
+ if (!reply_page) {
+ __free_page(get_info_op_page);
+ return -ENOMEM;
+ }
+
+ p = page_address(get_info_op_page);
+ end = p + get_info_op_buf_size;
+
+ /* encode cls_lock_get_info_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ get_info_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+
+ dout("%s lock_name %s\n", __func__, lock_name);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
+ CEPH_OSD_FLAG_READ, get_info_op_page,
+ get_info_op_buf_size, reply_page, &reply_len);
+
+ dout("%s: status %d\n", __func__, ret);
+ if (ret >= 0) {
+ p = page_address(reply_page);
+ end = p + reply_len;
+
+ ret = decode_lockers(&p, end, type, tag, lockers, num_lockers);
+ }
+
+ __free_page(get_info_op_page);
+ __free_page(reply_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock_info);
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 5fcfb98f309e..130ab407c5ec 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -17,10 +17,12 @@
# include <linux/kernel.h>
# include <linux/crush/crush.h>
# include <linux/crush/hash.h>
+# include <linux/crush/mapper.h>
#else
# include "crush_compat.h"
# include "crush.h"
# include "hash.h"
+# include "mapper.h"
#endif
#include "crush_ln_table.h"
@@ -245,7 +247,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
/* compute 2^44*log2(input+1) */
static __u64 crush_ln(unsigned int xin)
{
- unsigned int x = xin, x1;
+ unsigned int x = xin;
int iexpon, index1, index2;
__u64 RH, LH, LL, xl64, result;
@@ -253,9 +255,15 @@ static __u64 crush_ln(unsigned int xin)
/* normalize input */
iexpon = 15;
- while (!(x & 0x18000)) {
- x <<= 1;
- iexpon--;
+
+ /*
+ * figure out number of bits we need to shift and
+ * do it in one step instead of iteratively
+ */
+ if (!(x & 0x18000)) {
+ int bits = __builtin_clz(x & 0x1FFFF) - 16;
+ x <<= bits;
+ iexpon = 15 - bits;
}
index1 = (x >> 8) << 1;
@@ -267,12 +275,11 @@ static __u64 crush_ln(unsigned int xin)
/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
xl64 = (__s64)x * RH;
xl64 >>= 48;
- x1 = xl64;
result = iexpon;
result <<= (12 + 32);
- index2 = x1 & 0xff;
+ index2 = xl64 & 0xff;
/* LL ~ 2^48*log2(1.0+index2/2^15) */
LL = __LL_tbl[index2];
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index db2847ac5f12..292e33bd916e 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -13,14 +13,60 @@
#include <linux/ceph/decode.h>
#include "crypto.h"
+/*
+ * Set ->key and ->tfm. The rest of the key should be filled in before
+ * this function is called.
+ */
+static int set_secret(struct ceph_crypto_key *key, void *buf)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ key->key = NULL;
+ key->tfm = NULL;
+
+ switch (key->type) {
+ case CEPH_CRYPTO_NONE:
+ return 0; /* nothing to do */
+ case CEPH_CRYPTO_AES:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ WARN_ON(!key->len);
+ key->key = kmemdup(buf, key->len, GFP_NOIO);
+ if (!key->key) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* crypto_alloc_skcipher() allocates with GFP_KERNEL */
+ noio_flag = memalloc_noio_save();
+ key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(key->tfm)) {
+ ret = PTR_ERR(key->tfm);
+ key->tfm = NULL;
+ goto fail;
+ }
+
+ ret = crypto_skcipher_setkey(key->tfm, key->key, key->len);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ ceph_crypto_key_destroy(key);
+ return ret;
+}
+
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
const struct ceph_crypto_key *src)
{
memcpy(dst, src, sizeof(struct ceph_crypto_key));
- dst->key = kmemdup(src->key, src->len, GFP_NOFS);
- if (!dst->key)
- return -ENOMEM;
- return 0;
+ return set_secret(dst, src->key);
}
int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
@@ -37,16 +83,16 @@ int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
{
+ int ret;
+
ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
key->type = ceph_decode_16(p);
ceph_decode_copy(p, &key->created, sizeof(key->created));
key->len = ceph_decode_16(p);
ceph_decode_need(p, end, key->len, bad);
- key->key = kmalloc(key->len, GFP_NOFS);
- if (!key->key)
- return -ENOMEM;
- ceph_decode_copy(p, key->key, key->len);
- return 0;
+ ret = set_secret(key, *p);
+ *p += key->len;
+ return ret;
bad:
dout("failed to decode crypto key\n");
@@ -80,9 +126,14 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
return 0;
}
-static struct crypto_skcipher *ceph_crypto_alloc_cipher(void)
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
- return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (key) {
+ kfree(key->key);
+ key->key = NULL;
+ crypto_free_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
}
static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
@@ -157,372 +208,82 @@ static void teardown_sgtable(struct sg_table *sgt)
sg_free_table(sgt);
}
-static int ceph_aes_encrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- struct scatterlist sg_in[2], prealloc_sg;
- struct sg_table sg_out;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
- char iv[AES_BLOCK_SIZE];
- size_t zero_padding = (0x10 - (src_len & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src_len + zero_padding;
-
- sg_init_table(sg_in, 2);
- sg_set_buf(&sg_in[0], src, src_len);
- sg_set_buf(&sg_in[1], pad, zero_padding);
- ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
- src_len + zero_padding, iv);
-
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_crypt failed %d\n", ret);
- goto out_sg;
- }
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_out);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
- size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
-{
- struct scatterlist sg_in[3], prealloc_sg;
- struct sg_table sg_out;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
- char iv[AES_BLOCK_SIZE];
- size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src1_len + src2_len + zero_padding;
-
- sg_init_table(sg_in, 3);
- sg_set_buf(&sg_in[0], src1, src1_len);
- sg_set_buf(&sg_in[1], src2, src2_len);
- sg_set_buf(&sg_in[2], pad, zero_padding);
- ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
- src1_len + src2_len + zero_padding, iv);
-
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
- src1, src1_len, 1);
- print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
- src2, src2_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_crypt2 failed %d\n", ret);
- goto out_sg;
- }
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_out);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_decrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
+static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
{
- struct sg_table sg_in;
- struct scatterlist sg_out[2], prealloc_sg;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- char pad[16];
- char iv[AES_BLOCK_SIZE];
+ SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
+ struct sg_table sgt;
+ struct scatterlist prealloc_sg;
+ char iv[AES_BLOCK_SIZE] __aligned(8);
+ int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1));
+ int crypt_len = encrypt ? in_len + pad_byte : in_len;
int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- sg_init_table(sg_out, 2);
- sg_set_buf(&sg_out[0], dst, *dst_len);
- sg_set_buf(&sg_out[1], pad, sizeof(pad));
- ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+ WARN_ON(crypt_len > buf_len);
+ if (encrypt)
+ memset(buf + in_len, pad_byte, pad_byte);
+ ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len);
if (ret)
- goto out_tfm;
+ return ret;
- crypto_skcipher_setkey((void *)tfm, key, key_len);
memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_tfm(req, key->tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
- src_len, iv);
+ skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
/*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
+ print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1,
+ key->key, key->len, 1);
+ print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
*/
- ret = crypto_skcipher_decrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- goto out_sg;
- }
-
- if (src_len <= *dst_len)
- last_byte = ((char *)dst)[src_len - 1];
+ if (encrypt)
+ ret = crypto_skcipher_encrypt(req);
else
- last_byte = pad[src_len - *dst_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- *dst_len = src_len - last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
- /*
- print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_in);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_decrypt2(const void *key, int key_len,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- struct sg_table sg_in;
- struct scatterlist sg_out[3], prealloc_sg;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- char pad[16];
- char iv[AES_BLOCK_SIZE];
- int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- sg_init_table(sg_out, 3);
- sg_set_buf(&sg_out[0], dst1, *dst1_len);
- sg_set_buf(&sg_out[1], dst2, *dst2_len);
- sg_set_buf(&sg_out[2], pad, sizeof(pad));
- ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
- src_len, iv);
-
- /*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- */
- ret = crypto_skcipher_decrypt(req);
+ ret = crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- goto out_sg;
- }
-
- if (src_len <= *dst1_len)
- last_byte = ((char *)dst1)[src_len - 1];
- else if (src_len <= *dst1_len + *dst2_len)
- last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
- else
- last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- src_len -= last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
-
- if (src_len < *dst1_len) {
- *dst1_len = src_len;
- *dst2_len = 0;
- } else {
- *dst2_len = src_len - *dst1_len;
+ if (ret) {
+ pr_err("%s %scrypt failed: %d\n", __func__,
+ encrypt ? "en" : "de", ret);
+ goto out_sgt;
}
/*
- print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
- dst1, *dst1_len, 1);
- print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
- dst2, *dst2_len, 1);
+ print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
*/
-out_sg:
- teardown_sgtable(&sg_in);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- size_t t;
-
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst1_len + *dst2_len < src_len)
- return -ERANGE;
- t = min(*dst1_len, src_len);
- memcpy(dst1, src, t);
- *dst1_len = t;
- src += t;
- src_len -= t;
- if (src_len) {
- t = min(*dst2_len, src_len);
- memcpy(dst2, src, t);
- *dst2_len = t;
+ if (encrypt) {
+ *pout_len = crypt_len;
+ } else {
+ pad_byte = *(char *)(buf + in_len - 1);
+ if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE &&
+ in_len >= pad_byte) {
+ *pout_len = in_len - pad_byte;
+ } else {
+ pr_err("%s got bad padding %d on in_len %d\n",
+ __func__, pad_byte, in_len);
+ ret = -EPERM;
+ goto out_sgt;
}
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt2(secret->key, secret->len,
- dst1, dst1_len, dst2, dst2_len,
- src, src_len);
-
- default:
- return -EINVAL;
}
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
- case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
+out_sgt:
+ teardown_sgtable(&sgt);
+ return ret;
}
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
{
- switch (secret->type) {
+ switch (key->type) {
case CEPH_CRYPTO_NONE:
- if (*dst_len < src1_len + src2_len)
- return -ERANGE;
- memcpy(dst, src1, src1_len);
- memcpy(dst + src1_len, src2, src2_len);
- *dst_len = src1_len + src2_len;
+ *pout_len = in_len;
return 0;
-
case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
- src1, src1_len, src2, src2_len);
-
+ return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len,
+ pout_len);
default:
- return -EINVAL;
+ return -ENOTSUPP;
}
}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 2e9cab09f37b..58d83aa7740f 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -12,37 +12,19 @@ struct ceph_crypto_key {
struct ceph_timespec created;
int len;
void *key;
+ struct crypto_skcipher *tfm;
};
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
- if (key) {
- kfree(key->key);
- key->key = NULL;
- }
-}
-
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
const struct ceph_crypto_key *src);
int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key);
/* crypto.c */
-int ceph_decrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-int ceph_encrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len);
-int ceph_encrypt2(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len);
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len);
int ceph_crypto_init(void);
void ceph_crypto_shutdown(void);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a5502898ea33..bad3d4ae43f6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1393,15 +1393,9 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
return NULL;
}
- /* Can't hold the mutex while getting authorizer */
- mutex_unlock(&con->mutex);
auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
- mutex_lock(&con->mutex);
-
if (IS_ERR(auth))
return auth;
- if (con->state != CON_STATE_NEGOTIATING)
- return ERR_PTR(-EAGAIN);
con->auth_reply_buf = auth->authorizer_reply_buf;
con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
@@ -2027,6 +2021,19 @@ static int process_connect(struct ceph_connection *con)
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+ if (con->auth_reply_buf) {
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->verify_authorizer_reply().
+ * See get_connect_authorizer().
+ */
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+
switch (con->in_reply.tag) {
case CEPH_MSGR_TAG_FEATURES:
pr_err("%s%lld %s feature set mismatch,"
@@ -3418,7 +3425,7 @@ static void ceph_msg_release(struct kref *kref)
struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_get(&msg->kref);
return msg;
}
@@ -3427,7 +3434,7 @@ EXPORT_SYMBOL(ceph_msg_get);
void ceph_msg_put(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_put(&msg->kref, ceph_msg_release);
}
EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ef34a02719d7..29a0ef351c5e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -835,6 +835,83 @@ int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
}
EXPORT_SYMBOL(ceph_monc_get_version_async);
+static void handle_command_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+ ceph_decode_need(&p, end, sizeof(struct ceph_mon_request_header) +
+ sizeof(u32), bad);
+ p += sizeof(struct ceph_mon_request_header);
+
+ mutex_lock(&monc->mutex);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
+ }
+
+ req->result = ceph_decode_32(&p);
+ __finish_generic_request(req);
+ mutex_unlock(&monc->mutex);
+
+ complete_generic_request(req);
+ return;
+
+bad:
+ pr_err("corrupt mon_command ack, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_command *h;
+ int ret = -ENOMEM;
+ int len;
+
+ req = alloc_generic_request(monc, GFP_NOIO);
+ if (!req)
+ goto out;
+
+ req->request = ceph_msg_new(CEPH_MSG_MON_COMMAND, 256, GFP_NOIO, true);
+ if (!req->request)
+ goto out;
+
+ req->reply = ceph_msg_new(CEPH_MSG_MON_COMMAND_ACK, 512, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto out;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ h->num_strs = cpu_to_le32(1);
+ len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
+ \"blacklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
+ h->str_len = cpu_to_le32(len);
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
+
+ ret = wait_generic_request(req);
+out:
+ put_generic_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_blacklist_add);
+
/*
* Resend pending generic requests.
*/
@@ -951,21 +1028,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
err = -ENOMEM;
monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
sizeof(struct ceph_mon_subscribe_ack),
- GFP_NOFS, true);
+ GFP_KERNEL, true);
if (!monc->m_subscribe_ack)
goto out_auth;
- monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
- true);
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128,
+ GFP_KERNEL, true);
if (!monc->m_subscribe)
goto out_subscribe_ack;
- monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
- true);
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096,
+ GFP_KERNEL, true);
if (!monc->m_auth_reply)
goto out_subscribe;
- monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_KERNEL, true);
monc->pending_auth = 0;
if (!monc->m_auth)
goto out_auth_reply;
@@ -1139,6 +1216,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
handle_get_version_reply(monc, msg);
break;
+ case CEPH_MSG_MON_COMMAND_ACK:
+ handle_command_ack(monc, msg);
+ break;
+
case CEPH_MSG_MON_MAP:
ceph_monc_handle_map(monc, msg);
break;
@@ -1178,6 +1259,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
m = ceph_msg_get(monc->m_subscribe_ack);
break;
case CEPH_MSG_STATFS_REPLY:
+ case CEPH_MSG_MON_COMMAND_ACK:
return get_generic_reply(con, hdr, skip);
case CEPH_MSG_AUTH_REPLY:
m = ceph_msg_get(monc->m_auth_reply);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a97e7b506612..f3378ba1a828 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -338,6 +338,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
ceph_osd_data_release(&op->notify.request_data);
ceph_osd_data_release(&op->notify.response_data);
break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ceph_osd_data_release(&op->list_watchers.response_data);
+ break;
default:
break;
}
@@ -435,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
void ceph_osdc_get_request(struct ceph_osd_request *req)
{
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -444,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
{
if (req) {
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_put(&req->r_kref, ceph_osdc_release_request);
}
}
@@ -457,7 +460,7 @@ static void request_init(struct ceph_osd_request *req)
kref_init(&req->r_kref);
init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
+ init_completion(&req->r_done_completion);
RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node);
INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -484,11 +487,11 @@ static void request_reinit(struct ceph_osd_request *req)
struct ceph_msg *reply_msg = req->r_reply;
dout("%s req %p\n", __func__, req);
- WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ WARN_ON(kref_read(&req->r_kref) != 1);
request_release_checks(req);
- WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
- WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ WARN_ON(kref_read(&request_msg->kref) != 1);
+ WARN_ON(kref_read(&reply_msg->kref) != 1);
target_destroy(&req->r_t);
request_init(req);
@@ -863,6 +866,8 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
case CEPH_OSD_OP_NOTIFY:
dst->notify.cookie = cpu_to_le64(src->notify.cookie);
break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ break;
case CEPH_OSD_OP_SETALLOCHINT:
dst->alloc_hint.expected_object_size =
cpu_to_le64(src->alloc_hint.expected_object_size);
@@ -1445,6 +1450,10 @@ static void setup_request_data(struct ceph_osd_request *req,
ceph_osdc_msg_data_add(req->r_reply,
&op->extent.osd_data);
break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->list_watchers.response_data);
+ break;
/* both */
case CEPH_OSD_OP_CALL:
@@ -1716,7 +1725,7 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
__submit_request(req, wrlocked);
}
-static void __finish_request(struct ceph_osd_request *req)
+static void finish_request(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
struct ceph_osd *osd = req->r_osd;
@@ -1738,12 +1747,6 @@ static void __finish_request(struct ceph_osd_request *req)
ceph_msg_revoke_incoming(req->r_reply);
}
-static void finish_request(struct ceph_osd_request *req)
-{
- __finish_request(req);
- ceph_osdc_put_request(req);
-}
-
static void __complete_request(struct ceph_osd_request *req)
{
if (req->r_callback)
@@ -1761,9 +1764,9 @@ static void complete_request(struct ceph_osd_request *req, int err)
dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
req->r_result = err;
- __finish_request(req);
+ finish_request(req);
__complete_request(req);
- complete_all(&req->r_safe_completion);
+ complete_all(&req->r_done_completion);
ceph_osdc_put_request(req);
}
@@ -1789,6 +1792,8 @@ static void cancel_request(struct ceph_osd_request *req)
cancel_map_check(req);
finish_request(req);
+ complete_all(&req->r_done_completion);
+ ceph_osdc_put_request(req);
}
static void check_pool_dne(struct ceph_osd_request *req)
@@ -2799,12 +2804,12 @@ static bool done_request(const struct ceph_osd_request *req,
* ->r_unsafe_callback is set? yes no
*
* first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
- * any or needed/got safe) r_safe_completion r_safe_completion
+ * any or needed/got safe) r_done_completion r_done_completion
*
* first reply is unsafe r_unsafe_cb(true) (nothing)
*
* when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
- * r_safe_completion r_safe_completion
+ * r_done_completion r_done_completion
*/
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{
@@ -2906,7 +2911,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
}
if (done_request(req, &m)) {
- __finish_request(req);
+ finish_request(req);
if (req->r_linger) {
WARN_ON(req->r_unsafe_callback);
dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
@@ -2925,8 +2930,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
dout("req %p tid %llu cb\n", req, req->r_tid);
__complete_request(req);
}
- if (m.flags & CEPH_OSD_FLAG_ONDISK)
- complete_all(&req->r_safe_completion);
+ complete_all(&req->r_done_completion);
ceph_osdc_put_request(req);
} else {
if (req->r_unsafe_callback) {
@@ -3462,9 +3466,8 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
EXPORT_SYMBOL(ceph_osdc_start_request);
/*
- * Unregister a registered request. The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
+ * Unregister a registered request. The request is not completed:
+ * ->r_result isn't set and __complete_request() isn't called.
*/
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
@@ -3491,9 +3494,6 @@ static int wait_request_timeout(struct ceph_osd_request *req,
if (left <= 0) {
left = left ?: -ETIMEDOUT;
ceph_osdc_cancel_request(req);
-
- /* kludge - need to to wake ceph_osdc_sync() */
- complete_all(&req->r_safe_completion);
} else {
left = req->r_result; /* completed */
}
@@ -3540,7 +3540,7 @@ again:
up_read(&osdc->lock);
dout("%s waiting on req %p tid %llu last_tid %llu\n",
__func__, req, req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
+ wait_for_completion(&req->r_done_completion);
ceph_osdc_put_request(req);
goto again;
}
@@ -3891,12 +3891,121 @@ int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
return ret;
}
+static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 2, "watch_item_t",
+ &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ ceph_decode_copy(p, &item->name, sizeof(item->name));
+ item->cookie = ceph_decode_64(p);
+ *p += 4; /* skip timeout_seconds */
+ if (struct_v >= 2) {
+ ceph_decode_copy(p, &item->addr, sizeof(item->addr));
+ ceph_decode_addr(&item->addr);
+ }
+
+ dout("%s %s%llu cookie %llu addr %s\n", __func__,
+ ENTITY_NAME(item->name), item->cookie,
+ ceph_pr_addr(&item->addr.in_addr));
+ return 0;
+}
+
+static int decode_watchers(void **p, void *end,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int i;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
+ &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ *num_watchers = ceph_decode_32(p);
+ *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO);
+ if (!*watchers)
+ return -ENOMEM;
+
+ for (i = 0; i < *num_watchers; i++) {
+ ret = decode_watcher(p, end, *watchers + i);
+ if (ret) {
+ kfree(*watchers);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ * kfree(watchers);
+ */
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers)
+{
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_put_req;
+ }
+
+ osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
+ ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
+ response_data),
+ pages, PAGE_SIZE, 0, false, true);
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+ if (ret >= 0) {
+ void *p = page_address(pages[0]);
+ void *const end = p + req->r_ops[0].outdata_len;
+
+ ret = decode_watchers(&p, end, watchers, num_watchers);
+ }
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_list_watchers);
+
/*
* Call all pending notify callbacks - for use after a watch is
* unregistered, to make sure no more callbacks for it will be invoked
*/
void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
{
+ dout("%s osdc %p\n", __func__, osdc);
flush_workqueue(osdc->notify_wq);
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
@@ -3910,6 +4019,57 @@ void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
/*
+ * Execute an OSD class method on an object.
+ *
+ * @flags: CEPH_OSD_FLAG_*
+ * @resp_len: out param for reply length
+ */
+int ceph_osdc_call(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ const char *class, const char *method,
+ unsigned int flags,
+ struct page *req_page, size_t req_len,
+ struct page *resp_page, size_t *resp_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = flags;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
+ if (req_page)
+ osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
+ 0, false, false);
+ if (resp_page)
+ osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+ PAGE_SIZE, 0, false, false);
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+ if (ret >= 0) {
+ ret = req->r_ops[0].rval;
+ if (resp_page)
+ *resp_len = req->r_ops[0].outdata_len;
+ }
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_call);
+
+/*
* init, shutdown
*/
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
@@ -3925,6 +4085,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
osd_init(&osdc->homeless_osd);
osdc->homeless_osd.o_osdc = osdc;
osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+ osdc->last_linger_id = CEPH_LINGER_ID_START;
osdc->linger_requests = RB_ROOT;
osdc->map_checks = RB_ROOT;
osdc->linger_map_checks = RB_ROOT;
@@ -4308,13 +4469,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
}
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
+static int verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
- return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
}
static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 00d2601407c5..1a7c9a79a53c 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -26,7 +26,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
while (got < num_pages) {
rc = get_user_pages_unlocked(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, 0, pages + got);
+ num_pages - got, pages + got, write_page ? FOLL_WRITE : 0);
if (rc < 0)
break;
BUG_ON(rc == 0);
diff --git a/net/compat.c b/net/compat.c
index a96fd2f3507b..d69f539ca0bc 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -29,7 +29,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg,
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..f6761b6e3b29 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f8d5d3..ea633342ab0d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -36,7 +36,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -165,6 +165,7 @@ done:
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
+ * @destructor: invoked under the receive lock on successful dequeue
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
* the standard around please.
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err,
struct sk_buff **last)
{
@@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
+ *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
*last = skb;
- *peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
-
- skb = skb_set_peeked(skb);
- error = PTR_ERR(skb);
- if (IS_ERR(skb)) {
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ error = PTR_ERR(skb);
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
}
-
+ *peeked = 1;
atomic_inc(&skb->users);
- } else
+ } else {
__skb_unlink(skb, queue);
-
+ if (destructor)
+ destructor(sk, skb);
+ }
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
@@ -262,6 +268,8 @@ no_packet:
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
@@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+ off, err, &last);
if (skb)
return skb;
@@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, err);
+ NULL, &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -323,6 +331,31 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ if (destructor)
+ destructor(sk, skb);
+ err = 0;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ atomic_inc(&sk->sk_drops);
+ return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
@@ -346,23 +379,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- int err = 0;
-
- if (flags & MSG_PEEK) {
- err = -ENOENT;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
- err = 0;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
+ int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
kfree_skb(skb);
- atomic_inc(&sk->sk_drops);
sk_mem_reclaim_partial(sk);
-
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index ea6312057a71..29101c98399f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -72,7 +72,7 @@
* - netif_rx() feedback
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -139,7 +139,6 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
-#include <linux/sctp.h>
#include <linux/crash_dump.h>
#include "net-sysfs.h"
@@ -1696,24 +1695,19 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call static_key_slow_dec() from irq context
- * If net_disable_timestamp() is called from irq context, defer the
- * static_key_slow_dec() calls.
- */
static atomic_t netstamp_needed_deferred;
-#endif
-
-void net_enable_timestamp(void)
+static void netstamp_clear(struct work_struct *work)
{
-#ifdef HAVE_JUMP_LABEL
int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
- if (deferred) {
- while (--deferred)
- static_key_slow_dec(&netstamp_needed);
- return;
- }
+ while (deferred--)
+ static_key_slow_dec(&netstamp_needed);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
+
+void net_enable_timestamp(void)
+{
static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1721,25 +1715,25 @@ EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
- if (in_interrupt()) {
- atomic_inc(&netstamp_needed_deferred);
- return;
- }
-#endif
+ /* net_disable_timestamp() can be called from non process context */
+ atomic_inc(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
static_key_slow_dec(&netstamp_needed);
+#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
if (static_key_false(&netstamp_needed))
__net_timestamp(skb);
}
#define net_timestamp_check(COND, SKB) \
if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp.tv64) \
+ if ((COND) && !(SKB)->tstamp) \
__net_timestamp(SKB); \
} \
@@ -1766,19 +1760,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_orphan_frags(skb, GFP_ATOMIC) ||
- unlikely(!is_skb_forwardable(dev, skb))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
+ int ret = ____dev_forward_skb(dev, skb);
- skb_scrub_packet(skb, true);
- skb->priority = 0;
- skb->protocol = eth_type_trans(skb, dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ if (likely(!ret)) {
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
@@ -1949,37 +1938,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
}
}
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ return -1;
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
-static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
- int cpu, u16 index)
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ return false;
- for (pos = 0; map && pos < map->len; pos++) {
- if (map->queues[pos] == index) {
- if (map->len > 1) {
- map->queues[pos] = map->queues[--map->len];
- } else {
- RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
break;
}
+
+ RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
}
- return map;
+ return true;
}
-static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev->num_tc ? : 1;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, cpu, j))
+ break;
+ }
+
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
{
struct xps_dev_maps *dev_maps;
int cpu, i;
@@ -1991,21 +2023,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu) {
- for (i = index; i < dev->num_tx_queues; i++) {
- if (!remove_xps_queue(dev_maps, cpu, i))
- break;
- }
- if (i == dev->num_tx_queues)
- active = true;
- }
+ for_each_possible_cpu(cpu)
+ active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
+ offset, count);
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
- for (i = index; i < dev->num_tx_queues; i++)
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
NUMA_NO_NODE);
@@ -2013,6 +2040,11 @@ out_no_maps:
mutex_unlock(&xps_map_mutex);
}
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
static struct xps_map *expand_xps_map(struct xps_map *map,
int cpu, u16 index)
{
@@ -2052,20 +2084,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ int i, cpu, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
- int cpu, numa_node_id = -2;
bool active = false;
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
/* allocate memory for queue storage */
- for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mask))
- continue;
-
+ for_each_cpu_and(cpu, cpu_online_mask, mask) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2073,25 +2113,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ tci = cpu * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
NULL;
map = expand_xps_map(map, cpu, index);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
for_each_possible_cpu(cpu) {
+ /* copy maps belonging to foreign traffic classes */
+ for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
+
+ /* We need to explicitly update tci as prevous loop
+ * could break out early if dev_maps is NULL.
+ */
+ tci = cpu * num_tc + tc;
+
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
/* add queue to CPU maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(new_dev_maps->cpu_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2105,26 +2158,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
+ /* copy maps belonging to foreign traffic classes */
+ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
}
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
/* Cleanup old maps */
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
-
- kfree_rcu(dev_maps, rcu);
}
+ kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
@@ -2139,11 +2202,12 @@ out_no_new_maps:
/* removes queue from unused CPUs */
for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
- continue;
-
- if (remove_xps_queue(dev_maps, cpu, index))
- active = true;
+ for (i = tc, tci = cpu * num_tc; i--; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ active |= remove_xps_queue(dev_maps, tci, index);
+ for (i = num_tc - tc, tci++; --i; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
}
/* free map if not active */
@@ -2159,11 +2223,14 @@ out_no_maps:
error:
/* remove any maps that we added */
for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
- NULL;
- if (new_map && new_map != map)
- kfree(new_map);
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = dev_maps ?
+ xmap_dereference(dev_maps->cpu_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
}
mutex_unlock(&xps_map_mutex);
@@ -2174,6 +2241,44 @@ error:
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2484,7 +2589,7 @@ int skb_checksum_help(struct sk_buff *skb)
goto out;
}
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
out:
@@ -2492,141 +2597,6 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/* skb_csum_offload_check - Driver helper function to determine if a device
- * with limited checksum offload capabilities is able to offload the checksum
- * for a given packet.
- *
- * Arguments:
- * skb - sk_buff for the packet in question
- * spec - contains the description of what device can offload
- * csum_encapped - returns true if the checksum being offloaded is
- * encpasulated. That is it is checksum for the transport header
- * in the inner headers.
- * checksum_help - when set indicates that helper function should
- * call skb_checksum_help if offload checks fail
- *
- * Returns:
- * true: Packet has passed the checksum checks and should be offloadable to
- * the device (a driver may still need to check for additional
- * restrictions of its device)
- * false: Checksum is not offloadable. If checksum_help was set then
- * skb_checksum_help was called to resolve checksum for non-GSO
- * packets and when IP protocol is not SCTP
- */
-bool __skb_csum_offload_chk(struct sk_buff *skb,
- const struct skb_csum_offl_spec *spec,
- bool *csum_encapped,
- bool csum_help)
-{
- struct iphdr *iph;
- struct ipv6hdr *ipv6;
- void *nhdr;
- int protocol;
- u8 ip_proto;
-
- if (skb->protocol == htons(ETH_P_8021Q) ||
- skb->protocol == htons(ETH_P_8021AD)) {
- if (!spec->vlan_okay)
- goto need_help;
- }
-
- /* We check whether the checksum refers to a transport layer checksum in
- * the outermost header or an encapsulated transport layer checksum that
- * corresponds to the inner headers of the skb. If the checksum is for
- * something else in the packet we need help.
- */
- if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
- /* Non-encapsulated checksum */
- protocol = eproto_to_ipproto(vlan_get_protocol(skb));
- nhdr = skb_network_header(skb);
- *csum_encapped = false;
- if (spec->no_not_encapped)
- goto need_help;
- } else if (skb->encapsulation && spec->encap_okay &&
- skb_checksum_start_offset(skb) ==
- skb_inner_transport_offset(skb)) {
- /* Encapsulated checksum */
- *csum_encapped = true;
- switch (skb->inner_protocol_type) {
- case ENCAP_TYPE_ETHER:
- protocol = eproto_to_ipproto(skb->inner_protocol);
- break;
- case ENCAP_TYPE_IPPROTO:
- protocol = skb->inner_protocol;
- break;
- }
- nhdr = skb_inner_network_header(skb);
- } else {
- goto need_help;
- }
-
- switch (protocol) {
- case IPPROTO_IP:
- if (!spec->ipv4_okay)
- goto need_help;
- iph = nhdr;
- ip_proto = iph->protocol;
- if (iph->ihl != 5 && !spec->ip_options_okay)
- goto need_help;
- break;
- case IPPROTO_IPV6:
- if (!spec->ipv6_okay)
- goto need_help;
- if (spec->no_encapped_ipv6 && *csum_encapped)
- goto need_help;
- ipv6 = nhdr;
- nhdr += sizeof(*ipv6);
- ip_proto = ipv6->nexthdr;
- break;
- default:
- goto need_help;
- }
-
-ip_proto_again:
- switch (ip_proto) {
- case IPPROTO_TCP:
- if (!spec->tcp_okay ||
- skb->csum_offset != offsetof(struct tcphdr, check))
- goto need_help;
- break;
- case IPPROTO_UDP:
- if (!spec->udp_okay ||
- skb->csum_offset != offsetof(struct udphdr, check))
- goto need_help;
- break;
- case IPPROTO_SCTP:
- if (!spec->sctp_okay ||
- skb->csum_offset != offsetof(struct sctphdr, checksum))
- goto cant_help;
- break;
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_DEST: {
- u8 *opthdr = nhdr;
-
- if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
- goto need_help;
-
- ip_proto = opthdr[0];
- nhdr += (opthdr[1] + 1) << 3;
-
- goto ip_proto_again;
- }
- default:
- goto need_help;
- }
-
- /* Passed the tests for offloading checksum */
- return true;
-
-need_help:
- if (csum_help && !skb_shinfo(skb)->gso_size)
- skb_checksum_help(skb);
-cant_help:
- return false;
-}
-EXPORT_SYMBOL(__skb_csum_offload_chk);
-
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2820,9 +2790,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
- } else if (illegal_highdma(skb->dev, skb)) {
- features &= ~NETIF_F_SG;
}
+ if (illegal_highdma(skb->dev, skb))
+ features &= ~NETIF_F_SG;
return features;
}
@@ -3035,6 +3005,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
return head;
}
+EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
@@ -3220,8 +3191,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
+ unsigned int tci = skb->sender_cpu - 1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
@@ -3355,16 +3332,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
-#ifdef CONFIG_NET_SWITCHDEV
- /* Don't forward if offload device already forwarded */
- if (skb->offload_fwd_mark &&
- skb->offload_fwd_mark == dev->offload_fwd_mark) {
- consume_skb(skb);
- rc = NET_XMIT_SUCCESS;
- goto out;
- }
-#endif
-
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -3475,6 +3442,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3855,7 +3824,7 @@ int netif_rx_ni(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx_ni);
-static void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -3914,8 +3883,7 @@ static void net_tx_action(struct softirq_action *h)
}
}
-#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
- (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
@@ -4066,12 +4034,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
{
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
+
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- return nf_hook_ingress(skb);
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
}
#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
@@ -4308,32 +4281,53 @@ int netif_receive_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb);
-/* Network device is going away, flush any packets still pending
- * Called with irqs disabled.
- */
-static void flush_backlog(void *arg)
+DEFINE_PER_CPU(struct work_struct, flush_works);
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(struct work_struct *work)
{
- struct net_device *dev = arg;
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
struct sk_buff *skb, *tmp;
+ struct softnet_data *sd;
+ local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+
+ local_irq_disable();
rps_lock(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
rps_unlock(sd);
+ local_irq_enable();
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
+ local_bh_enable();
+}
+
+static void flush_all_backlogs(void)
+{
+ unsigned int cpu;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&flush_works, cpu));
+
+ put_online_cpus();
}
static int napi_gro_complete(struct sk_buff *skb)
@@ -4442,7 +4436,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
}
}
@@ -4480,7 +4476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
+ if (skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
@@ -4493,9 +4489,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->recursion_counter = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
@@ -4821,8 +4818,9 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
static int process_backlog(struct napi_struct *napi, int quota)
{
- int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ bool again = true;
+ int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
@@ -4833,23 +4831,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
napi->weight = weight_p;
- local_irq_disable();
- while (1) {
+ while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
- local_irq_enable();
__netif_receive_skb(skb);
rcu_read_unlock();
- local_irq_disable();
input_queue_head_incr(sd);
- if (++work >= quota) {
- local_irq_enable();
+ if (++work >= quota)
return work;
- }
+
}
+ local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
@@ -4861,16 +4856,14 @@ static int process_backlog(struct napi_struct *napi, int quota)
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
- rps_unlock(sd);
-
- break;
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
}
-
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
rps_unlock(sd);
+ local_irq_enable();
}
- local_irq_enable();
return work;
}
@@ -4904,26 +4897,36 @@ void __napi_schedule_irqoff(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ /* Some drivers call us directly, instead of calling
+ * napi_complete_done().
+ */
+ if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+ return false;
+
list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
+ return true;
}
EXPORT_SYMBOL(__napi_complete);
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
/*
- * don't let napi dequeue from the cpu poll list
- * just in case its running on a different cpu
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
- if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
- return;
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
if (n->gro_list) {
unsigned long timeout = 0;
@@ -4945,6 +4948,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
__napi_complete(n);
local_irq_restore(flags);
}
+ return true;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -4962,13 +4966,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
+
#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+ if (local_softirq_pending())
+ do_softirq();
+}
+
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+ int (*napi_poll)(struct napi_struct *napi, int budget);
int (*busy_poll)(struct napi_struct *dev);
+ void *have_poll_lock = NULL;
struct napi_struct *napi;
- int rc = false;
+ int rc;
+
+restart:
+ rc = false;
+ napi_poll = NULL;
rcu_read_lock();
@@ -4979,24 +5011,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
/* Note: ndo_busy_poll method is optional in linux-4.5 */
busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
- do {
+ preempt_disable();
+ for (;;) {
rc = 0;
local_bh_disable();
if (busy_poll) {
rc = busy_poll(napi);
- } else if (napi_schedule_prep(napi)) {
- void *have = netpoll_poll_lock(napi);
-
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
- if (rc == BUSY_POLL_BUDGET) {
- napi_complete_done(napi, rc);
- napi_schedule(napi);
- }
- }
- netpoll_poll_unlock(have);
+ goto count;
+ }
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
}
+ rc = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -5005,10 +5046,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (rc == LL_FLUSH_FAILED)
break; /* permanent failure */
- cpu_relax();
- } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
- !need_resched() && !busy_loop_timeout(end_time));
+ if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ busy_loop_timeout(end_time))
+ break;
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+ if (rc || busy_loop_timeout(end_time))
+ return rc;
+ goto restart;
+ }
+ cpu_relax();
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
@@ -5018,7 +5075,7 @@ EXPORT_SYMBOL(sk_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5038,7 +5095,6 @@ void napi_hash_add(struct napi_struct *napi)
spin_unlock(&napi_hash_lock);
}
-EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
@@ -5086,7 +5142,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
- spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
@@ -5187,7 +5242,7 @@ out_unlock:
return work;
}
-static void net_rx_action(struct softirq_action *h)
+static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
@@ -5204,7 +5259,7 @@ static void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- return;
+ goto out;
break;
}
@@ -5222,7 +5277,6 @@ static void net_rx_action(struct softirq_action *h)
}
}
- __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5232,6 +5286,8 @@ static void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -5262,6 +5318,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+ struct net_device *dev = data;
+
+ return upper_dev == dev;
+}
+
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
@@ -5276,11 +5339,30 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+ return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
+ * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
@@ -5291,7 +5373,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
- return !list_empty(&dev->all_adj_list.upper);
+ return !list_empty(&dev->adj_list.upper);
}
/**
@@ -5318,6 +5400,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
@@ -5354,16 +5450,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *upper;
@@ -5371,14 +5459,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- if (&upper->list == &dev->all_adj_list.upper)
+ if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
-EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev_rcu(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+ /* first is the upper device itself */
+ ret = fn(udev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its upper devices */
+ ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
/**
* netdev_lower_get_next_private - Get the next ->private from the
@@ -5461,51 +5576,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
}
EXPORT_SYMBOL(netdev_lower_get_next);
-/**
- * netdev_all_lower_get_next - Get the next device from all lower neighbour list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RTNL lock or
- * its own locking that guarantees that the neighbour all lower
- * list will remain unchanged.
- */
-struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry(*iter, struct netdev_adjacent, list);
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = lower->list.next;
+ *iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next);
-/**
- * netdev_all_lower_get_next_rcu - Get the next device from all
- * lower neighbour list, RCU variant
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
- struct list_head **iter)
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
- struct netdev_adjacent, list);
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
- return lower ? lower->dev : NULL;
+ return 0;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
@@ -5587,7 +5741,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr++;
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
return 0;
}
@@ -5601,8 +5758,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->private = private;
dev_hold(adj_dev);
- pr_debug("dev_hold for %s, because of link added from %s to %s\n",
- adj_dev->name, dev->name, adj_dev->name);
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5636,22 +5793,28 @@ free_adj:
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
- pr_err("tried to remove device %s from %s\n",
+ pr_err("Adjacency does not exist for device %s from %s\n",
dev->name, adj_dev->name);
- BUG();
+ WARN_ON(1);
+ return;
}
- if (adj->ref_nr > 1) {
- pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
- adj->ref_nr-1);
- adj->ref_nr--;
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
+ adj->ref_nr -= ref_nr;
return;
}
@@ -5662,7 +5825,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
- pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
@@ -5676,73 +5839,45 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
- master);
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
+ private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
- false);
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
+ private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
return 0;
}
-static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
-{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower,
- NULL, false);
-}
-
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
- __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
-}
-
-static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
-{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev);
-
- if (ret)
- return ret;
-
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
- &dev->adj_list.upper,
- &upper_dev->adj_list.lower,
- private, master);
- if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- return ret;
- }
-
- return 0;
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
@@ -5752,7 +5887,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
ASSERT_RTNL();
@@ -5761,10 +5895,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
+ if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
- if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
+ if (netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -5786,80 +5920,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- /* Now that we linked these devs, make all the upper_dev's
- * all_adj_list.upper visible to every dev's all_adj_list.lower an
- * versa, and don't forget the devices itself. All of these
- * links are non-neighbours.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- pr_debug("Interlinking %s with %s, non-neighbour\n",
- i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev);
- if (ret)
- goto rollback_mesh;
- }
- }
-
- /* add dev to every upper_dev's upper device */
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- pr_debug("linking %s's upper device %s with %s\n",
- upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev);
- if (ret)
- goto rollback_upper_mesh;
- }
-
- /* add upper_dev to every dev's lower device */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- pr_debug("linking %s's lower device %s with %s\n", dev->name,
- i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
- if (ret)
- goto rollback_lower_mesh;
- }
-
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
- goto rollback_lower_mesh;
+ goto rollback;
return 0;
-rollback_lower_mesh:
- to_i = i;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
- }
-
- i = NULL;
-
-rollback_upper_mesh:
- to_i = i;
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(dev, i->dev);
- }
-
- i = j = NULL;
-
-rollback_mesh:
- to_i = i;
- to_j = j;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i && j == to_j)
- break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
- }
- if (i == to_i)
- break;
- }
-
+rollback:
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
@@ -5916,7 +5985,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j;
ASSERT_RTNL();
changeupper_info.upper_dev = upper_dev;
@@ -5928,23 +5996,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
- /* Here is the tricky part. We must remove all dev's lower
- * devices from all upper_dev's upper devices and vice
- * versa, to maintain the graph relationship.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
-
- /* remove also the devices itself from lower/upper device
- * list
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
-
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev);
-
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
@@ -6482,9 +6533,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive. */
- if (new_mtu < 0)
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
+ dev->name, new_mtu, dev->min_mtu);
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
+ dev->name, new_mtu, dev->max_mtu);
return -EINVAL;
+ }
if (!netif_device_present(dev))
return -ENODEV;
@@ -6631,26 +6691,42 @@ EXPORT_SYMBOL(dev_change_proto_down);
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp = {};
+ struct netdev_xdp xdp;
int err;
+ ASSERT_RTNL();
+
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
+ if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0)
+ return err;
+ if (xdp.prog_attached)
+ return -EBUSY;
+ }
+
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
+
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
@@ -6723,8 +6799,8 @@ static void rollback_registered_many(struct list_head *head)
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
- on_each_cpu(flush_backlog, dev, 1);
}
+ flush_all_backlogs();
synchronize_net();
@@ -6759,6 +6835,7 @@ static void rollback_registered_many(struct list_head *head)
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
@@ -7637,16 +7714,17 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
- INIT_LIST_HEAD(&dev->all_adj_list.upper);
- INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+#ifdef CONFIG_NET_SCHED
+ hash_init(dev->qdisc_hash);
+#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
- dev->tx_queue_len = 1;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
@@ -7927,18 +8005,13 @@ out:
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
-static int dev_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *ocpu)
+static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
- unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ unsigned int cpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
@@ -7988,10 +8061,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
input_queue_head_incr(oldsd);
}
- return NOTIFY_OK;
+ return 0;
}
-
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
@@ -8286,8 +8358,11 @@ static int __init net_dev_init(void)
*/
for_each_possible_cpu(i) {
+ struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
+ INIT_WORK(flush, flush_backlog);
+
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
INIT_LIST_HEAD(&sd->poll_list);
@@ -8322,7 +8397,9 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
- hotcpu_notifier(dev_cpu_callback, 0);
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
out:
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5063088f1a..2b5bf9efa720 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
}
-static struct genl_family devlink_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = DEVLINK_GENL_NAME,
- .version = DEVLINK_GENL_VERSION,
- .maxattr = DEVLINK_ATTR_MAX,
- .netnsok = true,
- .pre_doit = devlink_nl_pre_doit,
- .post_doit = devlink_nl_post_doit,
-};
+static struct genl_family devlink_nl_family;
enum devlink_multicast_groups {
DEVLINK_MCGRP_CONFIG,
@@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink,
if (devlink->ops && devlink->ops->port_type_set) {
if (port_type == DEVLINK_PORT_TYPE_NOTSET)
return -EINVAL;
+ if (port_type == devlink_port->type)
+ return 0;
err = devlink->ops->port_type_set(devlink_port, port_type);
if (err)
return err;
@@ -1400,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
- u32 seq, int flags, u16 mode)
+ u32 seq, int flags)
{
+ const struct devlink_ops *ops = devlink->ops;
void *hdr;
+ int err = 0;
+ u16 mode;
+ u8 inline_mode;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out;
- if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
- goto nla_put_failure;
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto out;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto out;
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto out;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto out;
+ }
genlmsg_end(msg, hdr);
return 0;
-nla_put_failure:
+out:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
@@ -1428,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct sk_buff *msg;
- u16 mode;
int err;
if (!ops || !ops->eswitch_mode_get)
return -EOPNOTSUPP;
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- return err;
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
- info->snd_portid, info->snd_seq, 0, mode);
+ info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
@@ -1459,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
u16 mode;
+ u8 inline_mode;
+ int err = 0;
- if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
- return -EINVAL;
+ if (!ops)
+ return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
- if (ops && ops->eswitch_mode_set)
- return ops->eswitch_mode_set(devlink, mode);
- return -EOPNOTSUPP;
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1484,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1618,6 +1644,20 @@ static const struct genl_ops devlink_nl_ops[] = {
},
};
+static struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+ .module = THIS_MODULE,
+ .ops = devlink_nl_ops,
+ .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
/**
* devlink_alloc - Allocate new devlink instance resources
*
@@ -1840,9 +1880,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
static int __init devlink_module_init(void)
{
- return genl_register_family_with_ops_groups(&devlink_nl_family,
- devlink_nl_ops,
- devlink_nl_mcgrps);
+ return genl_register_family(&devlink_nl_family);
}
static void __exit devlink_module_exit(void)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index d6b3b579560d..fb55327dcfea 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,12 +59,7 @@ struct dm_hw_stat_delta {
unsigned long last_drop_val;
};
-static struct genl_family net_drop_monitor_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "NET_DM",
- .version = 2,
-};
+static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
@@ -80,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
struct nlattr *nla;
struct sk_buff *skb;
unsigned long flags;
+ void *msg_header;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -87,25 +83,45 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
skb = genlmsg_new(al, GFP_KERNEL);
- if (skb) {
- genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
- 0, NET_DM_CMD_ALERT);
- nla = nla_reserve(skb, NLA_UNSPEC,
- sizeof(struct net_dm_alert_msg));
- msg = nla_data(nla);
- memset(msg, 0, al);
- } else {
- mod_timer(&data->send_timer, jiffies + HZ / 10);
+ if (!skb)
+ goto err;
+
+ msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ if (!msg_header) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ if (!nla) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
}
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ goto out;
+err:
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+out:
spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
spin_unlock_irqrestore(&data->lock, flags);
+ if (skb) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
+
+ genlmsg_end(skb, genlmsg_data(gnlh));
+ }
+
return skb;
}
-static struct genl_multicast_group dropmon_mcgrps[] = {
+static const struct genl_multicast_group dropmon_mcgrps[] = {
{ .name = "events", },
};
@@ -351,6 +367,17 @@ static const struct genl_ops dropmon_ops[] = {
},
};
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .module = THIS_MODULE,
+ .ops = dropmon_ops,
+ .n_ops = ARRAY_SIZE(dropmon_ops),
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
@@ -367,8 +394,7 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
- dropmon_ops, dropmon_mcgrps);
+ rc = genl_register_family(&net_drop_monitor_family);
if (rc) {
pr_err("Could not create drop monitor netlink family\n");
return rc;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 977489820eb9..d92de0a1f0a4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -119,6 +119,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
[ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
};
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -227,6 +233,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
if (sset == ETH_SS_PHY_STATS) {
if (dev->phydev)
return phy_get_sset_count(dev->phydev);
@@ -253,6 +262,8 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS) {
struct phy_device *phydev = dev->phydev;
@@ -1394,9 +1405,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vzalloc(reglen);
- if (reglen && !regbuf)
- return -ENOMEM;
+ regbuf = NULL;
+ if (reglen) {
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+ }
ops->get_regs(dev, &regs, regbuf);
@@ -1701,7 +1715,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels, max;
+ struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
u32 max_rx_in_use = 0;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
@@ -2422,6 +2436,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
};
}
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+
+out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2572,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
+ case ETHTOOL_GLINKSETTINGS:
break;
default:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -2684,6 +2779,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SLINKSETTINGS:
rc = ethtool_set_link_ksettings(dev, useraddr);
break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range))
+ goto errout;
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index cb06aceb512a..1969b3f118c1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -39,7 +40,7 @@
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
@@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
return -ENOMEM;
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
@@ -94,14 +99,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
- return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+ return skb_get_poff(skb);
}
-static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -120,9 +124,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -145,7 +148,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_0(__get_raw_cpu_id)
{
return raw_smp_processor_id();
}
@@ -233,9 +236,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
case SKF_AD_OFF + SKF_AD_HATYPE:
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
- BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
BPF_REG_TMP, BPF_REG_CTX,
offsetof(struct sk_buff, dev));
/* if (tmp != 0) goto pc + 1 */
@@ -1350,17 +1352,26 @@ struct bpf_scratchpad {
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ return skb_ensure_writable(skb, write_len);
+}
+
static inline int bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err;
+ int err = __bpf_try_make_writable(skb, write_len);
- err = skb_ensure_writable(skb, write_len);
bpf_compute_data_end(skb);
-
return err;
}
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
if (skb_at_tc_ingress(skb))
@@ -1373,12 +1384,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}
-static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
- void *from = (void *) (long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
@@ -1413,12 +1421,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+ void *, to, u32, len)
{
- const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
- unsigned int offset = (unsigned int) r2;
- void *to = (void *)(unsigned long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(offset > 0xffff))
@@ -1446,10 +1451,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_STACK_SIZE,
};
-static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
@@ -1491,12 +1517,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
@@ -1544,12 +1569,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
+ __be32 *, to, u32, to_size, __wsum, seed)
{
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u64 diff_size = from_size + to_size;
- __be32 *from = (__be32 *) (long) r1;
- __be32 *to = (__be32 *) (long) r3;
+ u32 diff_size = from_size + to_size;
int i, j = 0;
/* This is quite flexible, some examples:
@@ -1575,6 +1599,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
static const struct bpf_func_proto bpf_csum_diff_proto = {
.func = bpf_csum_diff,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1583,11 +1608,44 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.arg5_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
}
+static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ int ret = ____dev_forward_skb(dev, skb);
+
+ if (likely(!ret)) {
+ skb->dev = dev;
+ ret = netif_rx(skb);
+ }
+
+ return ret;
+}
+
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
@@ -1607,10 +1665,55 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
-static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
+static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* skb->mac_len is not set on normal egress */
+ unsigned int mlen = skb->network_header - skb->mac_header;
+
+ __skb_pull(skb, mlen);
+
+ /* At ingress, the mac header has already been pulled once.
+ * At egress, skb_pospull_rcsum has to be done in case that
+ * the skb is originated from ingress (i.e. a forwarded skb)
+ * to ensure that rcsum starts at net header.
+ */
+ if (!skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ skb_pop_mac_header(skb);
+ skb_reset_mac_len(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
+ bpf_push_mac_rcsum(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ if (dev_is_mac_header_xmit(dev))
+ return __bpf_redirect_common(skb, dev, flags);
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
+}
+
+BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return -EINVAL;
@@ -1619,14 +1722,22 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!dev))
return -EINVAL;
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
return -ENOMEM;
- bpf_push_mac_rcsum(skb);
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with need to be uncloned. Should uncloning fail
+ * here, we need to free the just generated clone to unclone once
+ * again.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
- return flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ return __bpf_redirect(clone, dev, flags);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1645,7 +1756,7 @@ struct redirect_info {
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
-static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
@@ -1670,10 +1781,7 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
- bpf_push_mac_rcsum(skb);
-
- return ri->flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ return __bpf_redirect(skb, dev, ri->flags);
}
static const struct bpf_func_proto bpf_redirect_proto = {
@@ -1684,9 +1792,9 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
- return task_get_classid((struct sk_buff *) (unsigned long) r1);
+ return task_get_classid(skb);
}
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
@@ -1696,9 +1804,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
- return dst_tclassid((struct sk_buff *) (unsigned long) r1);
+ return dst_tclassid(skb);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1708,14 +1816,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
/* If skb_clear_hash() was called due to mangling, we can
* trigger SW recalculation here. Later access to hash
* can then use the inline skb->hash via context directly
* instead of calling this helper again.
*/
- return skb_get_hash((struct sk_buff *) (unsigned long) r1);
+ return skb_get_hash(skb);
}
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
@@ -1725,10 +1833,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After all direct packet write, this can be used once for
+ * triggering a lazy recalc on next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
+ u16, vlan_tci)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 vlan_proto = (__force __be16) r2;
int ret;
if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
@@ -1753,9 +1876,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
-static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
int ret;
bpf_push_mac_rcsum(skb);
@@ -1930,10 +2052,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
return -ENOTSUPP;
}
-static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
+ u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 proto = (__force __be16) r2;
int ret;
if (unlikely(flags))
@@ -1970,14 +2091,11 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = {
.arg3_type = ARG_ANYTHING,
};
-static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u32 pkt_type = r2;
-
/* We only allow a restricted subset to be changed for now. */
- if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
- pkt_type > PACKET_OTHERHOST))
+ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+ !skb_pkt_type_ok(pkt_type)))
return -EINVAL;
skb->pkt_type = pkt_type;
@@ -1992,19 +2110,163 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
.arg2_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_skb_data(void *func)
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
- if (func == bpf_skb_vlan_push)
- return true;
- if (func == bpf_skb_vlan_pop)
- return true;
- if (func == bpf_skb_store_bytes)
- return true;
- if (func == bpf_skb_change_proto)
- return true;
- if (func == bpf_l3_csum_replace)
- return true;
- if (func == bpf_l4_csum_replace)
+ u32 min_len = skb_network_offset(skb);
+
+ if (skb_transport_header_was_set(skb))
+ min_len = skb_transport_offset(skb);
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ min_len = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ return min_len;
+}
+
+static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+{
+ return skb->dev->mtu + skb->dev->hard_header_len;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ unsigned int old_len = skb->len;
+ int ret;
+
+ ret = __skb_grow_rcsum(skb, new_len);
+ if (!ret)
+ memset(skb->data + old_len, 0, new_len - old_len);
+ return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ return __skb_trim_rcsum(skb, new_len);
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 min_len = __bpf_skb_min_len(skb);
+ int ret;
+
+ if (unlikely(flags || new_len > max_len || new_len < min_len))
+ return -EINVAL;
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ /* The basic idea of this helper is that it's performing the
+ * needed work to either grow or trim an skb, and eBPF program
+ * rewrites the rest via helpers like bpf_skb_store_bytes(),
+ * bpf_lX_csum_replace() and others rather than passing a raw
+ * buffer here. This one is a slow path helper and intended
+ * for replies with control messages.
+ *
+ * Like in bpf_skb_change_proto(), we want to keep this rather
+ * minimal and without protocol specifics so that we are able
+ * to separate concerns as in bpf_skb_store_bytes() should only
+ * be the one responsible for writing buffers.
+ *
+ * It's really expected to be a slow path operation here for
+ * control message replies, so we're implicitly linearizing,
+ * uncloning and drop offloads from the skb by this.
+ */
+ ret = __bpf_try_make_writable(skb, skb->len);
+ if (!ret) {
+ if (new_len > skb->len)
+ ret = bpf_skb_grow_rcsum(skb, new_len);
+ else if (new_len < skb->len)
+ ret = bpf_skb_trim_rcsum(skb, new_len);
+ if (!ret && skb_is_gso(skb))
+ skb_gso_reset(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+ .func = bpf_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol network header, etc, stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < xdp->data_hard_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
+ func == bpf_skb_change_tail ||
+ func == bpf_skb_pull_data ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head)
return true;
return false;
@@ -2023,13 +2285,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
return 0;
}
-static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
- u64 meta_size)
+BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
{
- struct sk_buff *skb = (struct sk_buff *)(long) r1;
- struct bpf_map *map = (struct bpf_map *)(long) r2;
u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
- void *meta = (void *)(long) r4;
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -2056,10 +2315,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags)
return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}
-static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
+ u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
u8 compat[sizeof(struct bpf_tunnel_key)];
void *to_orig = to;
@@ -2124,10 +2382,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *to = (u8 *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
int err;
@@ -2162,10 +2418,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
static struct metadata_dst __percpu *md_dst;
-static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
+ const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
struct metadata_dst *md = this_cpu_ptr(md_dst);
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
@@ -2183,7 +2438,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
*/
memcpy(compat, from, size);
memset(compat + size, 0, sizeof(compat) - size);
- from = (struct bpf_tunnel_key *)compat;
+ from = (const struct bpf_tunnel_key *) compat;
break;
default:
return -EINVAL;
@@ -2233,10 +2488,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
+ const u8 *, from, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *from = (u8 *) (long) r2;
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
@@ -2282,28 +2536,24 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
}
-#ifdef CONFIG_SOCK_CGROUP_DATA
-static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
+ u32, idx)
{
- struct sk_buff *skb = (struct sk_buff *)(long)r1;
- struct bpf_map *map = (struct bpf_map *)(long)r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
struct sock *sk;
- u32 i = (u32)r3;
- sk = skb->sk;
+ sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return -ENOENT;
-
- if (unlikely(i >= array->map.max_entries))
+ if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
- cgrp = READ_ONCE(array->ptrs[i]);
+ cgrp = READ_ONCE(array->ptrs[idx]);
if (unlikely(!cgrp))
return -EAGAIN;
- return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+ return sk_under_cgroup_hierarchy(sk, cgrp);
}
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
@@ -2314,7 +2564,38 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
-#endif
+
+static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+ unsigned long off, unsigned long len)
+{
+ memcpy(dst_buff, src_buff + off, len);
+ return 0;
+}
+
+BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
+ bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
@@ -2330,6 +2611,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2350,8 +2633,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -2368,6 +2655,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_change_proto_proto;
case BPF_FUNC_skb_change_type:
return &bpf_skb_change_type_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -2382,14 +2671,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
-#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
-#endif
default:
return sk_filter_func_proto(func_id);
}
@@ -2398,10 +2687,92 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id)
{
- return sk_filter_func_proto(func_id);
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
}
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static bool __is_valid_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -2435,7 +2806,103 @@ static bool sk_filter_is_valid_access(int off, int size,
}
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (off < 0 || off + size > sizeof(struct bpf_sock))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
+}
+
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast-path, otherwise approximation that we might be
+ * a clone, do the rest in helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
}
static bool tc_cls_act_is_valid_access(int off, int size,
@@ -2465,17 +2932,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
}
-static bool __is_valid_xdp_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
return false;
if (off % size != 0)
return false;
- if (size != 4)
+ if (size != sizeof(__u32))
return false;
return true;
@@ -2497,7 +2963,7 @@ static bool xdp_is_valid_access(int off, int size,
break;
}
- return __is_valid_xdp_access(off, size, type);
+ return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
@@ -2506,10 +2972,10 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
-static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
@@ -2556,7 +3022,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
case offsetof(struct __sk_buff, ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
dst_reg, src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
@@ -2597,7 +3063,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
dst_reg, src_reg, insn);
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
prog->cb_access = 1;
@@ -2621,7 +3087,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
break;
case offsetof(struct __sk_buff, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
dst_reg, src_reg,
offsetof(struct sk_buff, data));
break;
@@ -2630,8 +3096,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, data_end);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct bpf_skb_data_end, data_end);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
- dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
+ ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2657,6 +3123,76 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ int dst_reg, int src_reg,
+ int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ default:
+ return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
+ ctx_off, insn_buf, prog);
+ }
+
+ return insn - insn_buf;
+}
+
static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2666,12 +3202,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
switch (ctx_off) {
case offsetof(struct xdp_md, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
dst_reg, src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_end):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
dst_reg, src_reg,
offsetof(struct xdp_buff, data_end));
break;
@@ -2683,13 +3219,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
.get_func_proto = tc_cls_act_func_proto,
.is_valid_access = tc_cls_act_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
};
static const struct bpf_verifier_ops xdp_ops = {
@@ -2698,6 +3235,31 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops cg_sock_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2718,12 +3280,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+ .ops = &cg_sock_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SOCK
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&cg_sock_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 3937b1b68d5b..f765c11d8df5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work)
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
atomic_dec(&xfrm->flow_cache_gc_count);
- WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
}
}
@@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
- if (fcp->hash_count > 2 * fc->high_watermark ||
- atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
- atomic_inc(&net->xfrm.flow_cache_genid);
+ if (atomic_read(&net->xfrm.flow_cache_gc_count) >
+ 2 * num_online_cpus() * fc->high_watermark) {
flo = ERR_PTR(-ENOBUFS);
goto ret_object;
}
@@ -419,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
return 0;
}
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache,
- hotcpu_notifier);
- int res, cpu = (unsigned long) hcpu;
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
+
+ return flow_cache_cpu_prepare(fc, cpu);
+}
+
+static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- res = flow_cache_cpu_prepare(fc, cpu);
- if (res)
- return notifier_from_errno(res);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- __flow_cache_shrink(fc, fcp, 0);
- break;
- }
- return NOTIFY_OK;
+ __flow_cache_shrink(fc, fcp, 0);
+ return 0;
}
int flow_cache_init(struct net *net)
@@ -467,18 +457,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- if (flow_cache_cpu_prepare(fc, i))
- goto err;
- }
- fc->hotcpu_notifier = (struct notifier_block){
- .notifier_call = flow_cache_cpu,
- };
- __register_hotcpu_notifier(&fc->hotcpu_notifier);
-
- cpu_notifier_register_done();
+ if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
+ goto err;
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -494,8 +474,6 @@ err:
fcp->hash_table = NULL;
}
- cpu_notifier_register_done();
-
free_percpu(fc->percpu);
fc->percpu = NULL;
@@ -509,7 +487,8 @@ void flow_cache_fini(struct net *net)
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
- unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
@@ -521,3 +500,14 @@ void flow_cache_fini(struct net *net)
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
+
+void __init flow_cache_hp_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
+ "net/flow:prepare",
+ flow_cache_cpu_up_prep,
+ flow_cache_cpu_dead);
+ WARN_ON(ret < 0);
+}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 52742a02814f..1b7673aac59d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -6,6 +6,8 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/gre.h>
+#include <net/pptp.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
@@ -56,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);
/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be32 entity at
+ * offset poff
+ */
+static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
+ void *data, int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
@@ -115,14 +139,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
+ bool skip_vlan = false;
u8 ip_proto = 0;
- bool ret = false;
+ bool ret;
if (!data) {
data = skb->data;
- proto = skb->protocol;
+ proto = skb_vlan_tag_present(skb) ?
+ skb->vlan_proto : skb->protocol;
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
}
@@ -242,22 +270,42 @@ ipv6:
case htons(ETH_P_8021Q): {
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
+ bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
- vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
- if (!vlan)
- goto out_bad;
+ if (vlan_tag_present)
+ proto = skb->protocol;
+
+ if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
+ data, hlen, &_vlan);
+ if (!vlan)
+ goto out_bad;
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ if (skip_vlan)
+ goto again;
+ }
+ skip_vlan = true;
if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_VLANID)) {
- key_tags = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_VLANID,
+ FLOW_DISSECTOR_KEY_VLAN)) {
+ key_vlan = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLAN,
target_container);
- key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+ if (vlan_tag_present) {
+ key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
+ key_vlan->vlan_priority =
+ (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ } else {
+ key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
+ VLAN_VID_MASK;
+ key_vlan->vlan_priority =
+ (ntohs(vlan->h_vlan_TCI) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ }
}
- proto = vlan->h_vlan_encapsulated_proto;
- nhoff += sizeof(*vlan);
goto again;
}
case htons(ETH_P_PPP_SES): {
@@ -338,32 +386,42 @@ mpls:
ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE: {
- struct gre_hdr {
- __be16 flags;
- __be16 proto;
- } *hdr, _hdr;
+ struct gre_base_hdr *hdr, _hdr;
+ u16 gre_ver;
+ int offset = 0;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
goto out_bad;
- /*
- * Only look inside GRE if version zero and no
- * routing
- */
- if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
+
+ /* Only look inside GRE without routing */
+ if (hdr->flags & GRE_ROUTING)
break;
- proto = hdr->proto;
- nhoff += 4;
+ /* Only look inside GRE for version 0 and 1 */
+ gre_ver = ntohs(hdr->flags & GRE_VERSION);
+ if (gre_ver > 1)
+ break;
+
+ proto = hdr->protocol;
+ if (gre_ver) {
+ /* Version1 must be PPTP, and check the flags */
+ if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+ break;
+ }
+
+ offset += sizeof(struct gre_base_hdr);
+
if (hdr->flags & GRE_CSUM)
- nhoff += 4;
+ offset += sizeof(((struct gre_full_hdr *)0)->csum) +
+ sizeof(((struct gre_full_hdr *)0)->reserved1);
+
if (hdr->flags & GRE_KEY) {
const __be32 *keyid;
__be32 _keyid;
- keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+ keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid),
data, hlen, &_keyid);
-
if (!keyid)
goto out_bad;
@@ -372,32 +430,66 @@ ip_proto_again:
key_keyid = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_GRE_KEYID,
target_container);
- key_keyid->keyid = *keyid;
+ if (gre_ver == 0)
+ key_keyid->keyid = *keyid;
+ else
+ key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
}
- nhoff += 4;
+ offset += sizeof(((struct gre_full_hdr *)0)->key);
}
+
if (hdr->flags & GRE_SEQ)
- nhoff += 4;
- if (proto == htons(ETH_P_TEB)) {
- const struct ethhdr *eth;
- struct ethhdr _eth;
-
- eth = __skb_header_pointer(skb, nhoff,
- sizeof(_eth),
- data, hlen, &_eth);
- if (!eth)
+ offset += sizeof(((struct pptp_gre_header *)0)->seq);
+
+ if (gre_ver == 0) {
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, nhoff + offset,
+ sizeof(_eth),
+ data, hlen, &_eth);
+ if (!eth)
+ goto out_bad;
+ proto = eth->h_proto;
+ offset += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ hlen = (nhoff + offset);
+ }
+ } else { /* version 1, must be PPTP */
+ u8 _ppp_hdr[PPP_HDRLEN];
+ u8 *ppp_hdr;
+
+ if (hdr->flags & GRE_ACK)
+ offset += sizeof(((struct pptp_gre_header *)0)->ack);
+
+ ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+ sizeof(_ppp_hdr),
+ data, hlen, _ppp_hdr);
+ if (!ppp_hdr)
goto out_bad;
- proto = eth->h_proto;
- nhoff += sizeof(*eth);
-
- /* Cap headers that we access via pointers at the
- * end of the Ethernet header as our maximum alignment
- * at that point is only 2 bytes.
- */
- if (NET_IP_ALIGN)
- hlen = nhoff;
+
+ switch (PPP_PROTOCOL(ppp_hdr)) {
+ case PPP_IP:
+ proto = htons(ETH_P_IP);
+ break;
+ case PPP_IPV6:
+ proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ /* Could probably catch some more like MPLS */
+ break;
+ }
+
+ offset += PPP_HDRLEN;
}
+ nhoff += offset;
key_control->flags |= FLOW_DIS_ENCAPSULATION;
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
@@ -478,15 +570,28 @@ ip_proto_again:
data, hlen);
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
out_good:
ret = true;
-out_bad:
+ key_control->thoff = (u16)nhoff;
+out:
key_basic->n_proto = proto;
key_basic->ip_proto = ip_proto;
- key_control->thoff = (u16)nhoff;
return ret;
+
+out_bad:
+ ret = false;
+ key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
+ goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);
@@ -653,7 +758,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
struct flow_keys keys;
@@ -874,8 +979,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
.offset = offsetof(struct flow_keys, ports),
},
{
- .key_id = FLOW_DISSECTOR_KEY_VLANID,
- .offset = offsetof(struct flow_keys, tags),
+ .key_id = FLOW_DISSECTOR_KEY_VLAN,
+ .offset = offsetof(struct flow_keys, vlan),
},
{
.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
@@ -940,4 +1045,4 @@ static int __init init_default_flow_dissectors(void)
return 0;
}
-late_initcall_sync(init_default_flow_dissectors);
+core_initcall(init_default_flow_dissectors);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..0385dece1f6f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,13 +7,14 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
* names to make it usable in general net subsystem.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -30,165 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator
-{
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (!list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
-
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
-
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(rate_est, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..b3eef90b2df9
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * access to maps strictly require a rcu_read_lock() for protection,
+ * mixing with BH RCU lock doesn't work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests which
+ * results in a new bpf_prog instance. Comparing names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 669ecc9f884e..c23465005f2f 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,6 +26,7 @@
#include <net/lwtunnel.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#include <net/nexthop.h>
#ifdef CONFIG_MODULES
@@ -39,6 +40,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "MPLS";
case LWTUNNEL_ENCAP_ILA:
return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -110,25 +115,91 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
+ ret = ops->build_state(dev, encap, family, cfg, lws);
+ if (ret)
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_valid_encap_type(u16 encap_type)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- rcu_read_unlock();
+ __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
+ rtnl_lock();
+
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
}
}
#endif
- if (likely(ops && ops->build_state))
- ret = ops->build_state(dev, encap, family, cfg, lws);
- rcu_read_unlock();
+ return ops ? 0 : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(lwtunnel_valid_encap_type);
- return ret;
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+{
+ struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+ struct nlattr *nla_entype;
+ struct nlattr *attrs;
+ struct nlattr *nla;
+ u16 encap_type;
+ int attrlen;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+ if (nla_entype) {
+ encap_type = nla_get_u16(nla_entype);
+
+ if (lwtunnel_valid_encap_type(encap_type) != 0)
+ return -EOPNOTSUPP;
+ }
+ }
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(lwtunnel_build_state);
+EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
+
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL(lwtstate_free);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
{
@@ -251,6 +322,41 @@ drop:
}
EXPORT_SYMBOL(lwtunnel_output);
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->xmit))
+ ret = ops->xmit(skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_xmit);
+
int lwtunnel_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cf26e04c4046..e7c12caa20c8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
neigh->parms->neigh_cleanup(neigh);
__neigh_notify(neigh, RTM_DELNEIGH, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -1148,7 +1149,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
} else
goto out;
} else {
- if (lladdr == neigh->ha && new == NUD_STALE)
+ if (lladdr == neigh->ha && new == NUD_STALE &&
+ !(flags & NEIGH_UPDATE_F_ADMIN))
new = old;
}
}
@@ -2290,13 +2292,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
- if (!net_eq(dev_net(n->dev), net))
- continue;
- if (neigh_ifindex_filtered(n->dev, filter_idx))
- continue;
- if (neigh_master_filtered(n->dev, filter_master_idx))
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2331,9 +2330,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (pneigh_net(n) != net)
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2926,7 +2923,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
return;
set_bit(index, p->data_state);
- call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
if (!dev) /* NULL dev means this is default value */
neigh_copy_dflt_parms(net, p, index);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6e4f34721080..b0c04cf4851d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -950,10 +950,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
+ struct kobject *kobj = &dev->_rx[i].kobj;
+
+ if (!list_empty(&dev_net(dev)->exit_list))
+ kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
- sysfs_remove_group(&dev->_rx[i].kobj,
- dev->sysfs_rx_queue_group);
- kobject_put(&dev->_rx[i].kobj);
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ kobject_put(kobj);
}
return error;
@@ -1021,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
-#ifdef CONFIG_XPS
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -1033,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
+static ssize_t show_traffic_class(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int index = get_netdev_queue_index(queue);
+ int tc = netdev_txq_to_tc(dev, index);
+
+ if (tc < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
static ssize_t show_tx_maxrate(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
char *buf)
@@ -1075,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+static struct netdev_queue_attribute queue_traffic_class =
+ __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+
#ifdef CONFIG_BQL
/*
* Byte queue limits sysfs structures and functions.
@@ -1190,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
struct net_device *dev = queue->dev;
+ int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
- int i, len;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
index = get_netdev_queue_index(queue);
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- for_each_possible_cpu(i) {
- struct xps_map *map =
- rcu_dereference(dev_maps->cpu_map[i]);
- if (map) {
- int j;
- for (j = 0; j < map->len; j++) {
- if (map->queues[j] == index) {
- cpumask_set_cpu(i, mask);
- break;
- }
+ for_each_possible_cpu(cpu) {
+ int i, tci = cpu * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ cpumask_set_cpu(cpu, mask);
+ break;
}
}
}
@@ -1260,6 +1289,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
static struct attribute *netdev_queue_default_attrs[] = {
&queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&queue_tx_maxrate.attr,
@@ -1340,6 +1370,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
+ if (!list_empty(&dev_net(dev)->exit_list))
+ queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
#endif
@@ -1525,6 +1557,9 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &(ndev->dev);
+ if (!list_empty(&dev_net(ndev)->exit_list))
+ dev_set_uevent_suppress(dev, 1);
+
kobject_get(&dev->kobj);
remove_queue_kobjects(ndev);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 10608dd1489f..3c4bbec39713 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -37,6 +37,11 @@ struct net init_net = {
};
EXPORT_SYMBOL(init_net);
+static bool init_net_initialized;
+
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -44,27 +49,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
- size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->len = max_gen_ptrs;
+ ng->s.len = max_gen_ptrs;
return ng;
}
-static int net_assign_generic(struct net *net, int id, void *data)
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
- BUG_ON(id == 0);
+ BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
- ng = old_ng;
- if (old_ng->len >= id)
- goto assign;
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
ng = net_alloc_generic();
if (ng == NULL)
@@ -81,12 +87,12 @@ static int net_assign_generic(struct net *net, int id, void *data)
* the old copy for kfree after a grace period.
*/
- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
rcu_assign_pointer(net->gen, ng);
- kfree_rcu(old_ng, rcu);
-assign:
- ng->ptr[id - 1] = data;
+ kfree_rcu(old_ng, s.rcu);
return 0;
}
@@ -120,8 +126,7 @@ out:
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
if (ops->id && ops->size) {
- int id = *ops->id;
- kfree(net_generic(net, id));
+ kfree(net_generic(net, *ops->id));
}
}
@@ -216,6 +221,8 @@ int peernet2id_alloc(struct net *net, struct net *peer)
bool alloc;
int id;
+ if (atomic_read(&net->count) == 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
spin_lock_bh(&net->nsid_lock);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
@@ -224,7 +231,6 @@ int peernet2id_alloc(struct net *net, struct net *peer)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
}
-EXPORT_SYMBOL(peernet2id_alloc);
/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
@@ -236,6 +242,7 @@ int peernet2id(struct net *net, struct net *peer)
spin_unlock_bh(&net->nsid_lock);
return id;
}
+EXPORT_SYMBOL(peernet2id);
/* This function returns true is the peer netns has an id assigned into the
* current netns.
@@ -307,6 +314,16 @@ out_undo:
#ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
@@ -348,19 +365,34 @@ void net_drop_ns(void *p)
struct net *copy_net_ns(unsigned long flags,
struct user_namespace *user_ns, struct net *old_net)
{
+ struct ucounts *ucounts;
struct net *net;
int rv;
if (!(flags & CLONE_NEWNET))
return get_net(old_net);
+ ucounts = inc_net_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
net = net_alloc();
- if (!net)
+ if (!net) {
+ dec_net_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
get_user_ns(user_ns);
- mutex_lock(&net_mutex);
+ rv = mutex_lock_killable(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ dec_net_namespaces(ucounts);
+ put_user_ns(user_ns);
+ return ERR_PTR(rv);
+ }
+
+ net->ucounts = ucounts;
rv = setup_net(net, user_ns);
if (rv == 0) {
rtnl_lock();
@@ -369,6 +401,7 @@ struct net *copy_net_ns(unsigned long flags,
}
mutex_unlock(&net_mutex);
if (rv < 0) {
+ dec_net_namespaces(ucounts);
put_user_ns(user_ns);
net_drop_ns(net);
return ERR_PTR(rv);
@@ -441,6 +474,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ dec_net_namespaces(net->ucounts);
put_user_ns(net->user_ns);
net_drop_ns(net);
}
@@ -528,7 +562,7 @@ static struct pernet_operations __net_initdata net_ns_ops = {
.exit = net_ns_net_exit,
};
-static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
[NETNSA_NONE] = { .type = NLA_UNSPEC },
[NETNSA_NSID] = { .type = NLA_S32 },
[NETNSA_PID] = { .type = NLA_U32 },
@@ -745,6 +779,8 @@ static int __init net_ns_init(void)
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
+ init_net_initialized = true;
+
rtnl_lock();
list_add_tail_rcu(&init_net.list, &net_namespace_list);
rtnl_unlock();
@@ -806,15 +842,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ if (!init_net_initialized) {
+ list_add_tail(&ops->list, list);
+ return 0;
+ }
+
return ops_init(ops, &init_net);
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- LIST_HEAD(net_exit_list);
- list_add(&init_net.exit_list, &net_exit_list);
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ if (!init_net_initialized) {
+ list_del(&ops->list);
+ } else {
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ }
}
#endif /* CONFIG_NET_NS */
@@ -828,7 +873,7 @@ static int register_pernet_operations(struct list_head *list,
if (ops->id) {
again:
- error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -836,7 +881,7 @@ again:
}
return error;
}
- max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
@@ -991,11 +1036,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+ return to_net_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
+ .owner = netns_owner,
};
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..9424673009c1 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -171,12 +171,12 @@ static void poll_one_napi(struct napi_struct *napi)
static void poll_napi(struct net_device *dev)
{
struct napi_struct *napi;
+ int cpu = smp_processor_id();
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner != smp_processor_id() &&
- spin_trylock(&napi->poll_lock)) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
- spin_unlock(&napi->poll_lock);
+ smp_store_release(&napi->poll_owner, -1);
}
}
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index bbd118b19aef..8e69ce472236 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -216,8 +216,8 @@
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) spin_lock(&(t->if_lock));
-#define if_unlock(t) spin_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock));
+#define if_unlock(t) mutex_unlock(&(t->if_lock));
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -413,7 +413,7 @@ struct pktgen_hdr {
};
-static int pg_net_id __read_mostly;
+static unsigned int pg_net_id __read_mostly;
struct pktgen_net {
struct net *net;
@@ -423,7 +423,7 @@ struct pktgen_net {
};
struct pktgen_thread {
- spinlock_t if_lock; /* for list of devices */
+ struct mutex if_lock; /* for list of devices */
struct list_head if_list; /* All device here */
struct list_head th_list;
struct task_struct *tsk;
@@ -2010,11 +2010,13 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
{
struct pktgen_thread *t;
+ mutex_lock(&pktgen_thread_lock);
+
list_for_each_entry(t, &pn->pktgen_threads, th_list) {
struct pktgen_dev *pkt_dev;
- rcu_read_lock();
- list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ if_lock(t);
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
if (pkt_dev->odev != dev)
continue;
@@ -2029,8 +2031,9 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
dev->name);
break;
}
- rcu_read_unlock();
+ if_unlock(t);
}
+ mutex_unlock(&pktgen_thread_lock);
}
static int pktgen_device_event(struct notifier_block *unused,
@@ -2286,7 +2289,7 @@ out:
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
{
- pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev);
+ pkt_dev->pkt_overhead = 0;
pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2777,13 +2780,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
- struct pktgen_dev *pkt_dev,
- unsigned int extralen)
+ struct pktgen_dev *pkt_dev)
{
+ unsigned int extralen = LL_RESERVED_SPACE(dev);
struct sk_buff *skb = NULL;
- unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
- pkt_dev->pkt_overhead;
+ unsigned int size;
+ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
if (pkt_dev->flags & F_NODE) {
int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
@@ -2796,8 +2799,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
+ /* the caller pre-fetches from skb->data and reserves for the mac hdr */
if (likely(skb))
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, extralen - 16);
return skb;
}
@@ -2830,16 +2834,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- datalen = (odev->hard_header_len + 16) & ~0xf;
-
- skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
prefetchw(skb->data);
- skb_reserve(skb, datalen);
+ skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
eth = (__u8 *) skb_push(skb, 14);
@@ -2959,7 +2961,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- skb = pktgen_alloc_skb(odev, pkt_dev, 16);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
@@ -3763,7 +3765,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
return -ENOMEM;
}
- spin_lock_init(&t->if_lock);
+ mutex_init(&t->if_lock);
t->cpu = cpu;
INIT_LIST_HEAD(&t->if_list);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 189cc78c77eb..75e3ea7bda08 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -40,7 +40,7 @@
#include <linux/pci.h>
#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -275,6 +275,7 @@ int rtnl_unregister(int protocol, int msgtype)
rtnl_msg_handlers[protocol][msgindex].doit = NULL;
rtnl_msg_handlers[protocol][msgindex].dumpit = NULL;
+ rtnl_msg_handlers[protocol][msgindex].calcit = NULL;
return 0;
}
@@ -704,6 +705,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
} else if (i == RTAX_FEATURES - 1) {
u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+ if (!user_features)
+ continue;
BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
if (nla_put_u32(skb, i + 1, user_features))
goto nla_put_failure;
@@ -837,15 +840,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
(ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
- size_t size = nla_total_size(sizeof(struct nlattr));
- size += nla_total_size(num_vfs * sizeof(struct nlattr));
+ size_t size = nla_total_size(0);
size += num_vfs *
- (nla_total_size(sizeof(struct ifla_vf_mac)) +
+ (nla_total_size(0) +
+ nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ nla_total_size(0) + /* nest IFLA_VF_STATS */
/* IFLA_VF_STATS_RX_PACKETS */
nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_PACKETS */
@@ -893,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev,
static size_t rtnl_xdp_size(const struct net_device *dev)
{
- size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */
+ size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
+ nla_total_size(1); /* XDP_ATTACHED */
if (!dev->netdev_ops->ndo_xdp)
return 0;
@@ -922,8 +931,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
- + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
- + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1109,14 +1118,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct nlattr *vfinfo)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_stats vf_stats;
struct ifla_vf_trust vf_trust;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
- struct nlattr *vf, *vfstats;
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
@@ -1133,11 +1143,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
return 0;
+ memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+
vf_mac.vf =
vf_vlan.vf =
+ vf_vlan_info.vf =
vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
@@ -1148,6 +1163,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
vf_tx_rate.rate = ivi.max_tx_rate;
vf_rate.min_tx_rate = ivi.min_tx_rate;
vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1156,10 +1174,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_rss_query_en.setting = ivi.rss_query_en;
vf_trust.setting = ivi.trusted;
vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vf)
+ goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1175,17 +1191,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
&vf_rss_query_en) ||
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
- return -EMSGSIZE;
+ goto nla_put_vf_failure;
+ vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
memset(&vf_stats, 0, sizeof(vf_stats));
if (dev->netdev_ops->ndo_get_vf_stats)
dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
&vf_stats);
vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats) {
- nla_nest_cancel(skb, vf);
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vfstats)
+ goto nla_put_vf_failure;
if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1197,11 +1219,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD))
- return -EMSGSIZE;
+ vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
}
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1446,6 +1476,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
@@ -1474,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -1578,7 +1610,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
if (link_dump_filtered(dev, master_idx, kind_ops))
- continue;
+ goto cont;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
@@ -1702,7 +1734,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
- ivv->qos);
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ if (len == 0)
+ return -EINVAL;
+
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
return err;
}
@@ -2103,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_XDP]) {
struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
ifla_xdp_policy);
@@ -2113,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
if (xdp[IFLA_XDP_FD]) {
err = dev_change_xdp_fd(dev,
- nla_get_s32(xdp[IFLA_XDP_FD]));
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2676,7 +2749,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
ext_filter_mask));
}
- return min_ifinfo_dump_size;
+ return nlmsg_total_size(min_ifinfo_dump_size);
}
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2791,7 +2864,10 @@ nla_put_failure:
static inline size_t rtnl_fdb_nlmsg_size(void)
{
- return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) +
+ nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(sizeof(u16)) + /* NDA_VLAN */
+ 0;
}
static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
@@ -3066,7 +3142,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[0])
+ if (*idx < cb->args[2])
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -3093,19 +3169,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
- int idx)
+ int *idx)
{
int err;
netif_addr_lock_bh(dev);
- err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
goto out;
- nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
- cb->args[1] = err;
- return idx;
+ return err;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -3118,9 +3193,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
const struct net_device_ops *cops = NULL;
struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
struct net *net = sock_net(skb->sk);
+ struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int idx = 0;
+ int h, s_h;
+ int idx = 0, s_idx;
+ int err = 0;
+ int fidx = 0;
if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
ifla_policy) == 0) {
@@ -3138,49 +3217,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- cb->args[1] = 0;
- for_each_netdev(net, dev) {
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
- if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
- } else {
- if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ if (brport_idx && (dev->ifindex != brport_idx))
continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !(dev->priv_flags & IFF_EBRIDGE))
- continue;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
- cops = ops;
- }
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+ cops = ops;
+ }
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- if (cops && cops->ndo_fdb_dump)
- idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
- idx);
- }
- if (cb->args[1] == -EMSGSIZE)
- break;
+ if (idx < s_idx)
+ goto cont;
- if (dev->netdev_ops->ndo_fdb_dump)
- idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
- idx);
- else
- idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
- if (cb->args[1] == -EMSGSIZE)
- break;
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb,
+ br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+ }
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
+ dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+
+ cops = NULL;
- cops = NULL;
+ /* reset fdb offset to 0 for rest of the interfaces */
+ cb->args[2] = 0;
+ fidx = 0;
+cont:
+ idx++;
+ }
}
- cb->args[0] = idx;
+out:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = fidx;
+
return skb->len;
}
@@ -3550,6 +3651,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+ switch (attr_id) {
+ case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+ return sizeof(struct rtnl_link_stats64);
+ }
+
+ return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+ int *prividx)
+{
+ struct nlattr *attr = NULL;
+ int attr_id, size;
+ void *attr_data;
+ int err;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return -ENODATA;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (attr_id < *prividx)
+ continue;
+
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (!size)
+ continue;
+
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+ attr_data);
+ if (err)
+ goto get_offload_stats_failure;
+ }
+
+ if (!attr)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+get_offload_stats_failure:
+ *prividx = attr_id;
+ return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+ int nla_size = 0;
+ int attr_id;
+ int size;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return 0;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, unsigned int filter_mask,
@@ -3559,6 +3745,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
struct nlmsghdr *nlh;
struct nlattr *attr;
int s_prividx = *prividx;
+ int err;
ASSERT_RTNL();
@@ -3587,8 +3774,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS);
@@ -3612,8 +3797,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (master)
ops = master->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3628,6 +3811,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = rtnl_get_offload_stats(skb, dev, prividx);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3642,10 +3843,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
- [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
-};
-
static size_t if_nlmsg_stats_size(const struct net_device *dev,
u32 filter_mask)
{
@@ -3685,6 +3882,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+ size += rtnl_get_offload_stats_size(dev);
+
return size;
}
@@ -3698,6 +3898,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
u32 filter_mask;
int err;
+ if (nlmsg_len(nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(nlh);
if (ifsm->ifindex > 0)
dev = __dev_get_by_index(net, ifsm->ifindex);
@@ -3747,6 +3950,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->seq = net->dev_base_seq;
+ if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(cb->nlh);
filter_mask = ifsm->filter_mask;
if (!filter_mask)
diff --git a/net/core/scm.c b/net/core/scm.c
index 2696aefdc148..d8820438ba37 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -29,7 +29,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..88a8e429fc3e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -12,6 +12,7 @@
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <net/tcp.h>
#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
@@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
@@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
md5_transform(hash, secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 hash[MD5_DIGEST_WORDS];
@@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
md5_transform(hash, net_secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b68fa1..734c71468b01 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,7 +72,7 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
@@ -354,7 +354,7 @@ EXPORT_SYMBOL(build_skb);
struct napi_alloc_cache {
struct page_frag_cache page;
- size_t skb_count;
+ unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, fragsz, gfp_mask);
+ data = page_frag_alloc(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
@@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
local_irq_restore(flags);
@@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(&nc->page, len, gfp_mask);
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return false;
}
-ssize_t skb_socket_splice(struct sock *sk,
- struct pipe_inode_info *pipe,
- struct splice_pipe_desc *spd)
-{
- int ret;
-
- /* Drop the socket lock, otherwise we have reverse
- * locking dependencies between sk_lock and i_mutex
- * here as compared to sendfile(). We enter here
- * with the socket lock held, and splice_to_pipe() will
- * grab the pipe inode lock. For sendfile() emulation,
- * we call into ->sendpage() with the i_mutex lock held
- * and networking will grab the socket lock.
- */
- release_sock(sk);
- ret = splice_to_pipe(pipe, spd);
- lock_sock(sk);
-
- return ret;
-}
-
/*
* Map data from the skb to a pipe. Should handle both the linear part,
* the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
- unsigned int flags,
- ssize_t (*splice_cb)(struct sock *,
- struct pipe_inode_info *,
- struct splice_pipe_desc *))
+ unsigned int flags)
{
struct partial_page partial[MAX_SKB_FRAGS];
struct page *pages[MAX_SKB_FRAGS];
@@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
if (spd.nr_pages)
- ret = splice_cb(sk, pipe, &spd);
+ ret = splice_to_pipe(pipe, &spd);
return ret;
}
@@ -2445,6 +2421,25 @@ void skb_queue_purge(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_queue_purge);
/**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+void skb_rbtree_purge(struct rb_root *root)
+{
+ struct sk_buff *skb, *next;
+
+ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
+ kfree_skb(skb);
+
+ *root = RB_ROOT;
+}
+
+/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
@@ -2661,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
struct skb_frag_struct *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
- BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ if (skb_headlen(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3078,11 +3075,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
- /* GSO partial only requires that we trim off any excess that
- * doesn't fit into an MSS sized block, so take care of that
- * now.
- */
- if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* Split the buffer at the frag_list pointer.
+ * This is based on the assumption that all
+ * buffers in the chain excluding the last
+ * containing the same amount of data.
+ */
+ skb_walk_frags(head_skb, iter) {
+ if (skb_headlen(iter))
+ goto normal;
+
+ len -= iter->len;
+ }
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
@@ -3090,6 +3107,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
partial_segs = 0;
}
+normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3281,21 +3299,29 @@ perform_csum_check:
*/
segs->prev = tail;
- /* Update GSO info on first skb in partial sequence. */
if (partial_segs) {
+ struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
- type |= SKB_GSO_PARTIAL;
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
- skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
- skb_shinfo(segs)->gso_segs = partial_segs;
- skb_shinfo(segs)->gso_type = type;
- SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
@@ -3688,21 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
- struct sk_buff *skb, *skb_next;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
unsigned long flags;
- int err = 0;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q)))
- err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ icmp_next = is_icmp_err_skb(skb_next);
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_err = err;
- if (err)
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
sk->sk_error_report(sk);
return skb;
@@ -3814,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
- if (tsonly)
- skb = alloc_skb(0, GFP_ATOMIC);
- else
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ skb = tcp_get_timestamping_opt_stats(sk);
+ else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
if (!skb)
return;
@@ -4326,7 +4368,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
@@ -4474,17 +4516,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
-/* remove VLAN header from packet and update csum accordingly. */
-static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
- goto pull;
+ return err;
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
@@ -4501,12 +4550,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_len(skb);
-pull:
- __skb_pull(skb, offset);
return err;
}
+EXPORT_SYMBOL(__skb_vlan_pop);
+/* Pop a vlan tag either from hwaccel or from payload.
+ * Expects skb->data at mac header.
+ */
int skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
@@ -4516,9 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb)
if (likely(skb_vlan_tag_present(skb))) {
skb->vlan_tci = 0;
} else {
- if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci);
@@ -4526,9 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb)
return err;
}
/* move next vlan tag to hw accel tag */
- if (likely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (likely(!eth_type_vlan(skb->protocol)))
return 0;
vlan_proto = skb->protocol;
@@ -4541,29 +4588,30 @@ int skb_vlan_pop(struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_vlan_pop);
+/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
+ * Expects skb->data at mac header.
+ */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
if (skb_vlan_tag_present(skb)) {
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- /* __vlan_insert_tag expect skb->data pointing to mac header.
- * So change skb->data before calling it and change back to
- * original position later
- */
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
- if (err) {
- __skb_pull(skb, offset);
+ if (err)
return err;
- }
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
- __skb_pull(skb, offset);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
@@ -4883,3 +4931,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
return clone;
}
EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be over estimated,
+ * because skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index fd7b41edf1ce..4eca27dc5c94 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,7 +118,7 @@
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
@@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
"sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
- "sk_lock-AF_MAX"
+ "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
"slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
- "slock-AF_MAX"
+ "slock-AF_QIPCRTR", "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
"clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
- "clock-AF_MAX"
+ "clock-AF_QIPCRTR", "clock-AF_MAX"
};
/*
@@ -453,7 +453,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
EXPORT_SYMBOL(sock_queue_rcv_skb);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
- const int nested, unsigned int trim_cap)
+ const int nested, unsigned int trim_cap, bool refcounted)
{
int rc = NET_RX_SUCCESS;
@@ -487,7 +487,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
bh_unlock_sock(sk);
out:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return rc;
discard_and_relse:
kfree_skb(skb);
@@ -714,7 +715,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -750,7 +751,7 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
break;
case SO_RCVBUFFORCE:
@@ -853,6 +854,13 @@ set_rcvbuf:
sk->sk_tskey = 0;
}
}
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -1315,24 +1323,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
#endif
}
-void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
-{
- unsigned long nulls1, nulls2;
-
- nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
- nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
- if (nulls1 > nulls2)
- swap(nulls1, nulls2);
-
- if (nulls1 != 0)
- memset((char *)sk, 0, nulls1);
- memset((char *)sk + nulls1 + sizeof(void *), 0,
- nulls2 - nulls1 - sizeof(void *));
- memset((char *)sk + nulls2 + sizeof(void *), 0,
- size - nulls2 - sizeof(void *));
-}
-EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
-
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
@@ -1344,12 +1334,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
- if (priority & __GFP_ZERO) {
- if (prot->clear_sk)
- prot->clear_sk(sk, prot->obj_size);
- else
- sk_prot_clear_nulls(sk, prot->obj_size);
- }
+ if (priority & __GFP_ZERO)
+ sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
@@ -1385,6 +1371,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
slab = prot->slab;
cgroup_sk_free(&sk->sk_cgrp_data);
+ mem_cgroup_sk_free(sk);
security_sk_free(sk);
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -1421,6 +1408,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
+ mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
@@ -1563,10 +1551,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(&newsk->sk_cookie, 0);
+ mem_cgroup_sk_alloc(newsk);
cgroup_sk_alloc(&newsk->sk_cgrp_data);
/*
@@ -1591,9 +1581,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- sock_update_memcg(newsk);
-
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
@@ -2100,37 +2087,31 @@ void __sk_flush_backlog(struct sock *sk)
*/
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc;
- DEFINE_WAIT(wait);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
EXPORT_SYMBOL(sk_wait_data);
/**
- * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * __sk_mem_raise_allocated - increase memory_allocated
* @sk: socket
* @size: memory size to allocate
+ * @amt: pages to allocate
* @kind: allocation type
*
- * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
- * rmem allocation. This function assumes that protocols which have
- * memory_pressure use sk_wmem_queued as write buffer accounting.
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
*/
-int __sk_mem_schedule(struct sock *sk, int size, int kind)
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
- int amt = sk_mem_pages(size);
- long allocated;
-
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-
- allocated = sk_memory_allocated_add(sk, amt);
+ long allocated = sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
!mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2191,9 +2172,6 @@ suppress_allocation:
trace_sock_exceed_buf_limit(sk, prot, allocated);
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-
sk_memory_allocated_sub(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2201,18 +2179,40 @@ suppress_allocation:
return 0;
}
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ return ret;
+}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_mem_reclaim - reclaim memory_allocated
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
*/
-void __sk_mem_reclaim(struct sock *sk, int amount)
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
sk_memory_allocated_sub(sk, amount);
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2221,6 +2221,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
@@ -2456,8 +2469,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index e92b759d906c..9a1a352fd1eb 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -129,7 +129,6 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2)
return 0;
}
-EXPORT_SYMBOL(reuseport_add_sock);
static void reuseport_free_rcu(struct rcu_head *head)
{
diff --git a/net/core/stream.c b/net/core/stream.c
index 159516a11b7e..f575bcf64af2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk)
rcu_read_unlock();
}
}
-EXPORT_SYMBOL(sk_stream_write_space);
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
@@ -54,8 +53,8 @@ EXPORT_SYMBOL(sk_stream_write_space);
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
int done;
do {
@@ -69,13 +68,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk_sleep(sk), &wait);
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
@@ -95,16 +94,16 @@ static inline int sk_stream_closing(struct sock *sk)
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
@@ -120,16 +119,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
long vm_wait = 0;
long current_timeo = *timeo_p;
bool noblock = (*timeo_p ? false : true);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ add_wait_queue(sk_sleep(sk), &wait);
+
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p) {
@@ -148,7 +147,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
- !vm_wait));
+ !vm_wait), &wait);
sk->sk_write_pending--;
if (vm_wait) {
@@ -162,7 +161,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..2a46e4009f62 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- if (sock_table)
+ if (sock_table) {
static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
diff --git a/net/core/utils.c b/net/core/utils.c
index cf5622b9ccc4..6592d7bbed39 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -31,7 +31,7 @@
#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 4f6c1862dfd2..3202d75329b5 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1353,6 +1353,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
dcb_unlock:
spin_unlock_bh(&dcb_lock);
nla_put_failure:
+ err = -EMSGSIZE;
return err;
}
diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..8fedc2d49770 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -606,7 +606,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
skb) < 0)
return 1;
- goto discard;
+ consume_skb(skb);
+ return 0;
}
if (dh->dccph_type == DCCP_PKT_RESET)
goto discard;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 345a3aeb8c7e..d859a5c36e70 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -235,7 +235,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (struct iphdr *)skb->data;
const u8 offset = iph->ihl << 2;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ const struct dccp_hdr *dh;
struct dccp_sock *dp;
struct inet_sock *inet;
const int type = icmp_hdr(skb)->type;
@@ -245,11 +245,13 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
int err;
struct net *net = dev_net(skb->dev);
- if (skb->len < offset + sizeof(*dh) ||
- skb->len < offset + __dccp_basic_hdr_len(dh)) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
- }
+ /* Only need dccph_dport & dccph_sport which are the first
+ * 4 bytes in dccp header.
+ * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us.
+ */
+ BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
+ BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
+ dh = (struct dccp_hdr *)(skb->data + offset);
sk = __inet_lookup_established(net, &dccp_hashinfo,
iph->daddr, dh->dccph_dport,
@@ -588,13 +590,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (inet_csk_reqsk_queue_is_full(sk))
goto drop;
- /*
- * Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk))
goto drop;
req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
@@ -698,6 +694,7 @@ int dccp_invalid_packet(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
unsigned int cscov;
+ u8 dccph_doff;
if (skb->pkt_type != PACKET_HOST)
return 1;
@@ -719,18 +716,19 @@ int dccp_invalid_packet(struct sk_buff *skb)
/*
* If P.Data Offset is too small for packet type, drop packet and return
*/
- if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff);
+ dccph_doff = dh->dccph_doff;
+ if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+ DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
return 1;
}
/*
* If P.Data Offset is too too large for packet, drop packet and return
*/
- if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
- DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff);
+ if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
+ DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
return 1;
}
-
+ dh = dccp_hdr(skb);
/*
* If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
* has short sequence numbers), drop packet and return
@@ -868,7 +866,7 @@ lookup:
goto discard_and_relse;
nf_reset(skb);
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4);
+ return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted);
no_dccp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3828f94b234c..c4e879c02186 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -70,7 +70,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ const struct dccp_hdr *dh;
struct dccp_sock *dp;
struct ipv6_pinfo *np;
struct sock *sk;
@@ -78,12 +78,13 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
__u64 seq;
struct net *net = dev_net(skb->dev);
- if (skb->len < offset + sizeof(*dh) ||
- skb->len < offset + __dccp_basic_hdr_len(dh)) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return;
- }
+ /* Only need dccph_dport & dccph_sport which are the first
+ * 4 bytes in dccp header.
+ * Our caller (icmpv6_notify()) already pulled 8 bytes for us.
+ */
+ BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
+ BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
+ dh = (struct dccp_hdr *)(skb->data + offset);
sk = __inet6_lookup_established(net, &dccp_hashinfo,
&hdr->daddr, dh->dccph_dport,
@@ -226,7 +227,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -280,7 +281,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
@@ -325,7 +326,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (inet_csk_reqsk_queue_is_full(sk))
goto drop;
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk))
goto drop;
req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
@@ -738,7 +739,8 @@ lookup:
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0;
+ return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4,
+ refcounted) ? -1 : 0;
no_dccp_socket:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -956,6 +958,7 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
+ .bind_conflict = inet6_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 41e65804ddf5..9fe25bf63296 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1009,6 +1009,10 @@ void dccp_close(struct sock *sk, long timeout)
__kfree_skb(skb);
}
+ /* If socket has been already reset kill it. */
+ if (sk->sk_state == DCCP_CLOSED)
+ goto adjudge_to_death;
+
if (data_was_unread) {
/* Unread data was tossed, send an appropriate Reset Code */
DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 13d6b1a6e0fc..a90ed67027b0 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1718,7 +1718,7 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
* See if there is data ready to read, sleep if there isn't
*/
for(;;) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk->sk_err)
goto out;
@@ -1749,11 +1749,11 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
+ sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
skb_queue_walk_safe(queue, skb, n) {
@@ -1999,19 +1999,19 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
* size.
*/
if (dn_queue_too_long(scp, queue, flags)) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (flags & MSG_DONTWAIT) {
err = -EWOULDBLOCK;
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo,
- !dn_queue_too_long(scp, queue, flags));
+ !dn_queue_too_long(scp, queue, flags), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
continue;
}
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index b2c26b081134..8fdd9f492b0e 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -42,7 +42,7 @@
#include <linux/notifier.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/dst.h>
@@ -201,7 +201,7 @@ static struct dn_dev_sysctl_table {
.extra1 = &min_t3,
.extra2 = &max_t3
},
- {0}
+ { }
},
};
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index a796fc7cbc35..7af0ba6157a1 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -31,7 +31,7 @@
#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/neighbour.h>
#include <net/dst.h>
#include <net/flow.h>
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 1540b506e3e0..232675480756 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -25,7 +25,7 @@
#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/route.h> /* RTF_xxx */
#include <net/neighbour.h>
#include <net/netlink.h>
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5325b541c526..6c7da6c29bf0 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -22,7 +22,7 @@
#include <net/dst.h>
#include <net/flow.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/dn.h>
#include <net/dn_dev.h>
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index ff7736f7ff42..96e47c539bee 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA
config NET_DSA_TAG_TRAILER
bool
+config NET_DSA_TAG_QCA
+ bool
+
endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 8af4ded70f1c..a3380ed0e0be 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7e68bc6bc853..7899919cd9f0 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -54,6 +54,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
#ifdef CONFIG_NET_DSA_TAG_BRCM
[DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
#endif
+#ifdef CONFIG_NET_DSA_TAG_QCA
+ [DSA_TAG_PROTO_QCA] = &qca_netdev_ops,
+#endif
[DSA_TAG_PROTO_NONE] = &none_ops,
};
@@ -61,27 +64,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
static DEFINE_MUTEX(dsa_switch_drivers_mutex);
static LIST_HEAD(dsa_switch_drivers);
-void register_switch_driver(struct dsa_switch_driver *drv)
+void register_switch_driver(struct dsa_switch_ops *ops)
{
mutex_lock(&dsa_switch_drivers_mutex);
- list_add_tail(&drv->list, &dsa_switch_drivers);
+ list_add_tail(&ops->list, &dsa_switch_drivers);
mutex_unlock(&dsa_switch_drivers_mutex);
}
EXPORT_SYMBOL_GPL(register_switch_driver);
-void unregister_switch_driver(struct dsa_switch_driver *drv)
+void unregister_switch_driver(struct dsa_switch_ops *ops)
{
mutex_lock(&dsa_switch_drivers_mutex);
- list_del_init(&drv->list);
+ list_del_init(&ops->list);
mutex_unlock(&dsa_switch_drivers_mutex);
}
EXPORT_SYMBOL_GPL(unregister_switch_driver);
-static struct dsa_switch_driver *
+static struct dsa_switch_ops *
dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
const char **_name, void **priv)
{
- struct dsa_switch_driver *ret;
+ struct dsa_switch_ops *ret;
struct list_head *list;
const char *name;
@@ -90,13 +93,13 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
mutex_lock(&dsa_switch_drivers_mutex);
list_for_each(list, &dsa_switch_drivers) {
- struct dsa_switch_driver *drv;
+ struct dsa_switch_ops *ops;
- drv = list_entry(list, struct dsa_switch_driver, list);
+ ops = list_entry(list, struct dsa_switch_ops, list);
- name = drv->probe(parent, host_dev, sw_addr, priv);
+ name = ops->probe(parent, host_dev, sw_addr, priv);
if (name != NULL) {
- ret = drv;
+ ret = ops;
break;
}
}
@@ -117,7 +120,7 @@ static ssize_t temp1_input_show(struct device *dev,
struct dsa_switch *ds = dev_get_drvdata(dev);
int temp, ret;
- ret = ds->drv->get_temp(ds, &temp);
+ ret = ds->ops->get_temp(ds, &temp);
if (ret < 0)
return ret;
@@ -131,7 +134,7 @@ static ssize_t temp1_max_show(struct device *dev,
struct dsa_switch *ds = dev_get_drvdata(dev);
int temp, ret;
- ret = ds->drv->get_temp_limit(ds, &temp);
+ ret = ds->ops->get_temp_limit(ds, &temp);
if (ret < 0)
return ret;
@@ -149,7 +152,7 @@ static ssize_t temp1_max_store(struct device *dev,
if (ret < 0)
return ret;
- ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
+ ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
if (ret < 0)
return ret;
@@ -164,7 +167,7 @@ static ssize_t temp1_max_alarm_show(struct device *dev,
bool alarm;
int ret;
- ret = ds->drv->get_temp_alarm(ds, &alarm);
+ ret = ds->ops->get_temp_alarm(ds, &alarm);
if (ret < 0)
return ret;
@@ -184,15 +187,15 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
{
struct device *dev = container_of(kobj, struct device, kobj);
struct dsa_switch *ds = dev_get_drvdata(dev);
- struct dsa_switch_driver *drv = ds->drv;
+ struct dsa_switch_ops *ops = ds->ops;
umode_t mode = attr->mode;
if (index == 1) {
- if (!drv->get_temp_limit)
+ if (!ops->get_temp_limit)
mode = 0;
- else if (!drv->set_temp_limit)
+ else if (!ops->set_temp_limit)
mode &= ~S_IWUSR;
- } else if (index == 2 && !drv->get_temp_alarm) {
+ } else if (index == 2 && !ops->get_temp_alarm) {
mode = 0;
}
return mode;
@@ -228,8 +231,10 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
genphy_config_init(phydev);
genphy_read_status(phydev);
- if (ds->drv->adjust_link)
- ds->drv->adjust_link(ds, port, phydev);
+ if (ds->ops->adjust_link)
+ ds->ops->adjust_link(ds, port, phydev);
+
+ put_device(&phydev->mdio.dev);
}
return 0;
@@ -303,7 +308,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
{
- struct dsa_switch_driver *drv = ds->drv;
+ struct dsa_switch_ops *ops = ds->ops;
struct dsa_switch_tree *dst = ds->dst;
struct dsa_chip_data *cd = ds->cd;
bool valid_name_found = false;
@@ -354,7 +359,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* switch.
*/
if (dst->cpu_switch == index) {
- dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol);
+ enum dsa_tag_protocol tag_protocol;
+
+ tag_protocol = ops->get_tag_protocol(ds);
+ dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
if (IS_ERR(dst->tag_ops)) {
ret = PTR_ERR(dst->tag_ops);
goto out;
@@ -368,15 +376,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
/*
* Do basic register setup.
*/
- ret = drv->setup(ds);
+ ret = ops->setup(ds);
if (ret < 0)
goto out;
- ret = drv->set_addr(ds, dst->master_netdev->dev_addr);
- if (ret < 0)
- goto out;
+ if (ops->set_addr) {
+ ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (ret < 0)
+ goto out;
+ }
- if (!ds->slave_mii_bus && drv->phy_read) {
+ if (!ds->slave_mii_bus && ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(parent);
if (!ds->slave_mii_bus) {
ret = -ENOMEM;
@@ -423,7 +433,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* register with hardware monitoring subsystem.
* Treat registration error as non-fatal and ignore it.
*/
- if (drv->get_temp) {
+ if (ops->get_temp) {
const char *netname = netdev_name(dst->master_netdev);
char hname[IFNAMSIZ + 1];
int i, j;
@@ -454,7 +464,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
struct device *parent, struct device *host_dev)
{
struct dsa_chip_data *cd = dst->pd->chip + index;
- struct dsa_switch_driver *drv;
+ struct dsa_switch_ops *ops;
struct dsa_switch *ds;
int ret;
const char *name;
@@ -463,8 +473,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
/*
* Probe for switch model.
*/
- drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv);
- if (drv == NULL) {
+ ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv);
+ if (!ops) {
netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n",
index);
return ERR_PTR(-EINVAL);
@@ -483,7 +493,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
ds->dst = dst;
ds->index = index;
ds->cd = cd;
- ds->drv = drv;
+ ds->ops = ops;
ds->priv = priv;
ds->dev = parent;
@@ -496,15 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
void dsa_cpu_dsa_destroy(struct device_node *port_dn)
{
- struct phy_device *phydev;
-
- if (of_phy_is_fixed_link(port_dn)) {
- phydev = of_phy_find_device(port_dn);
- if (phydev) {
- phy_device_free(phydev);
- fixed_phy_unregister(phydev);
- }
- }
+ if (of_phy_is_fixed_link(port_dn))
+ of_phy_deregister_fixed_link(port_dn);
}
static void dsa_switch_destroy(struct dsa_switch *ds)
@@ -538,12 +541,12 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
ds->dsa_port_mask |= ~(1 << port);
}
- if (ds->slave_mii_bus && ds->drv->phy_read)
+ if (ds->slave_mii_bus && ds->ops->phy_read)
mdiobus_unregister(ds->slave_mii_bus);
}
#ifdef CONFIG_PM_SLEEP
-static int dsa_switch_suspend(struct dsa_switch *ds)
+int dsa_switch_suspend(struct dsa_switch *ds)
{
int i, ret = 0;
@@ -557,18 +560,19 @@ static int dsa_switch_suspend(struct dsa_switch *ds)
return ret;
}
- if (ds->drv->suspend)
- ret = ds->drv->suspend(ds);
+ if (ds->ops->suspend)
+ ret = ds->ops->suspend(ds);
return ret;
}
+EXPORT_SYMBOL_GPL(dsa_switch_suspend);
-static int dsa_switch_resume(struct dsa_switch *ds)
+int dsa_switch_resume(struct dsa_switch *ds)
{
int i, ret = 0;
- if (ds->drv->resume)
- ret = ds->drv->resume(ds);
+ if (ds->ops->resume)
+ ret = ds->ops->resume(ds);
if (ret)
return ret;
@@ -585,6 +589,7 @@ static int dsa_switch_resume(struct dsa_switch *ds)
return 0;
}
+EXPORT_SYMBOL_GPL(dsa_switch_resume);
#endif
/* platform driver init and cleanup *****************************************/
@@ -1086,7 +1091,6 @@ static int dsa_resume(struct device *d)
static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
static const struct of_device_id dsa_of_match_table[] = {
- { .compatible = "brcm,bcm7445-switch-v4.0" },
{ .compatible = "marvell,dsa", },
{}
};
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index f30bad9678f0..0f99297b2fb3 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -28,8 +28,10 @@ static struct dsa_switch_tree *dsa_get_dst(u32 tree)
struct dsa_switch_tree *dst;
list_for_each_entry(dst, &dsa_switch_trees, list)
- if (dst->tree == tree)
+ if (dst->tree == tree) {
+ kref_get(&dst->refcount);
return dst;
+ }
return NULL;
}
@@ -271,6 +273,7 @@ static int dsa_user_port_apply(struct device_node *port, u32 index,
if (err) {
dev_warn(ds->dev, "Failed to create slave %d: %d\n",
index, err);
+ ds->ports[index].netdev = NULL;
return err;
}
@@ -294,25 +297,23 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
int err;
/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
- * driver and before drv->setup() has run, since the switch drivers and
+ * driver and before ops->setup() has run, since the switch drivers and
* the slave MDIO bus driver rely on these values for probing PHY
* devices or not
*/
ds->phys_mii_mask = ds->enabled_port_mask;
- err = ds->drv->setup(ds);
- if (err < 0)
- return err;
-
- err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
+ err = ds->ops->setup(ds);
if (err < 0)
return err;
- err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr);
- if (err < 0)
- return err;
+ if (ds->ops->set_addr) {
+ err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (err < 0)
+ return err;
+ }
- if (!ds->slave_mii_bus && ds->drv->phy_read) {
+ if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
if (!ds->slave_mii_bus)
return -ENOMEM;
@@ -374,7 +375,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
dsa_user_port_unapply(port, index, ds);
}
- if (ds->slave_mii_bus && ds->drv->phy_read)
+ if (ds->slave_mii_bus && ds->ops->phy_read)
mdiobus_unregister(ds->slave_mii_bus);
}
@@ -394,9 +395,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
return err;
}
- err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
- if (err)
- return err;
+ if (dst->ds[0]) {
+ err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
+ if (err)
+ return err;
+ }
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
@@ -433,7 +436,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
dsa_ds_unapply(dst, ds);
}
- dsa_cpu_port_ethtool_restore(dst->ds[0]);
+ if (dst->ds[0])
+ dsa_cpu_port_ethtool_restore(dst->ds[0]);
pr_info("DSA: tree %d unapplied\n", dst->tree);
dst->applied = false;
@@ -443,6 +447,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
struct dsa_switch_tree *dst,
struct dsa_switch *ds)
{
+ enum dsa_tag_protocol tag_protocol;
struct net_device *ethernet_dev;
struct device_node *ethernet;
@@ -465,7 +470,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
dst->cpu_port = index;
}
- dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol);
+ tag_protocol = ds->ops->get_tag_protocol(ds);
+ dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
if (IS_ERR(dst->tag_ops)) {
dev_warn(ds->dev, "No tagger for this switch\n");
return PTR_ERR(dst->tag_ops);
@@ -541,7 +547,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
ds->ports[reg].dn = port;
- /* Initialize enabled_port_mask now for drv->setup()
+ /* Initialize enabled_port_mask now for ops->setup()
* to have access to a correct value, just like what
* net/dsa/dsa.c::dsa_switch_setup_one does.
*/
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 00077a9c97f4..6cfd7388834e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops;
/* tag_brcm.c */
extern const struct dsa_device_ops brcm_netdev_ops;
+/* tag_qca.c */
+extern const struct dsa_device_ops qca_netdev_ops;
#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index fc9196745225..7d4596110851 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -28,7 +28,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
struct dsa_switch *ds = bus->priv;
if (ds->phys_mii_mask & (1 << addr))
- return ds->drv->phy_read(ds, addr, reg);
+ return ds->ops->phy_read(ds, addr, reg);
return 0xffff;
}
@@ -38,7 +38,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
struct dsa_switch *ds = bus->priv;
if (ds->phys_mii_mask & (1 << addr))
- return ds->drv->phy_write(ds, addr, reg, val);
+ return ds->ops->phy_write(ds, addr, reg, val);
return 0;
}
@@ -69,6 +69,30 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
return !!p->bridge_dev;
}
+static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+{
+ struct dsa_port *dp = &ds->ports[port];
+
+ if (ds->ops->port_stp_state_set)
+ ds->ops->port_stp_state_set(ds, port, state);
+
+ if (ds->ops->port_fast_age) {
+ /* Fast age FDB entries or flush appropriate forwarding database
+ * for the given port, if we are moving it from Learning or
+ * Forwarding state, to Disabled or Blocking or Listening state.
+ */
+
+ if ((dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING) &&
+ (state == BR_STATE_DISABLED ||
+ state == BR_STATE_BLOCKING ||
+ state == BR_STATE_LISTENING))
+ ds->ops->port_fast_age(ds, port);
+ }
+
+ dp->stp_state = state;
+}
+
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
@@ -98,14 +122,13 @@ static int dsa_slave_open(struct net_device *dev)
goto clear_allmulti;
}
- if (ds->drv->port_enable) {
- err = ds->drv->port_enable(ds, p->port, p->phy);
+ if (ds->ops->port_enable) {
+ err = ds->ops->port_enable(ds, p->port, p->phy);
if (err)
goto clear_promisc;
}
- if (ds->drv->port_stp_state_set)
- ds->drv->port_stp_state_set(ds, p->port, stp_state);
+ dsa_port_set_stp_state(ds, p->port, stp_state);
if (p->phy)
phy_start(p->phy);
@@ -144,11 +167,10 @@ static int dsa_slave_close(struct net_device *dev)
if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
dev_uc_del(master, dev->dev_addr);
- if (ds->drv->port_disable)
- ds->drv->port_disable(ds, p->port, p->phy);
+ if (ds->ops->port_disable)
+ ds->ops->port_disable(ds, p->port, p->phy);
- if (ds->drv->port_stp_state_set)
- ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
return 0;
}
@@ -209,13 +231,13 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
struct dsa_switch *ds = p->parent;
if (switchdev_trans_ph_prepare(trans)) {
- if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add)
+ if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
return -EOPNOTSUPP;
- return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
+ return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans);
}
- ds->drv->port_vlan_add(ds, p->port, vlan, trans);
+ ds->ops->port_vlan_add(ds, p->port, vlan, trans);
return 0;
}
@@ -226,10 +248,10 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (!ds->drv->port_vlan_del)
+ if (!ds->ops->port_vlan_del)
return -EOPNOTSUPP;
- return ds->drv->port_vlan_del(ds, p->port, vlan);
+ return ds->ops->port_vlan_del(ds, p->port, vlan);
}
static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -239,8 +261,8 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->port_vlan_dump)
- return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
+ if (ds->ops->port_vlan_dump)
+ return ds->ops->port_vlan_dump(ds, p->port, vlan, cb);
return -EOPNOTSUPP;
}
@@ -253,13 +275,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
struct dsa_switch *ds = p->parent;
if (switchdev_trans_ph_prepare(trans)) {
- if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add)
+ if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
return -EOPNOTSUPP;
- return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
+ return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans);
}
- ds->drv->port_fdb_add(ds, p->port, fdb, trans);
+ ds->ops->port_fdb_add(ds, p->port, fdb, trans);
return 0;
}
@@ -271,8 +293,8 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
struct dsa_switch *ds = p->parent;
int ret = -EOPNOTSUPP;
- if (ds->drv->port_fdb_del)
- ret = ds->drv->port_fdb_del(ds, p->port, fdb);
+ if (ds->ops->port_fdb_del)
+ ret = ds->ops->port_fdb_del(ds, p->port, fdb);
return ret;
}
@@ -284,8 +306,52 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->port_fdb_dump)
- return ds->drv->port_fdb_dump(ds, p->port, fdb, cb);
+ if (ds->ops->port_fdb_dump)
+ return ds->ops->port_fdb_dump(ds, p->port, fdb, cb);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_slave_port_mdb_add(struct net_device *dev,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct switchdev_trans *trans)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (switchdev_trans_ph_prepare(trans)) {
+ if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans);
+ }
+
+ ds->ops->port_mdb_add(ds, p->port, mdb, trans);
+
+ return 0;
+}
+
+static int dsa_slave_port_mdb_del(struct net_device *dev,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->ops->port_mdb_del)
+ return ds->ops->port_mdb_del(ds, p->port, mdb);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_slave_port_mdb_dump(struct net_device *dev,
+ struct switchdev_obj_port_mdb *mdb,
+ switchdev_obj_dump_cb_t *cb)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->ops->port_mdb_dump)
+ return ds->ops->port_mdb_dump(ds, p->port, mdb, cb);
return -EOPNOTSUPP;
}
@@ -308,9 +374,9 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
struct dsa_switch *ds = p->parent;
if (switchdev_trans_ph_prepare(trans))
- return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP;
+ return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
- ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state);
+ dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
return 0;
}
@@ -326,8 +392,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
if (switchdev_trans_ph_prepare(trans))
return 0;
- if (ds->drv->port_vlan_filtering)
- return ds->drv->port_vlan_filtering(ds, p->port,
+ if (ds->ops->port_vlan_filtering)
+ return ds->ops->port_vlan_filtering(ds, p->port,
attr->u.vlan_filtering);
return 0;
@@ -365,8 +431,8 @@ static int dsa_slave_ageing_time(struct net_device *dev,
ds->ports[p->port].ageing_time = ageing_time;
ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
- if (ds->drv->set_ageing_time)
- return ds->drv->set_ageing_time(ds, ageing_time);
+ if (ds->ops->set_ageing_time)
+ return ds->ops->set_ageing_time(ds, ageing_time);
return 0;
}
@@ -412,6 +478,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
SWITCHDEV_OBJ_PORT_FDB(obj),
trans);
break;
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+ trans);
+ break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
err = dsa_slave_port_vlan_add(dev,
SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -435,6 +505,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
err = dsa_slave_port_fdb_del(dev,
SWITCHDEV_OBJ_PORT_FDB(obj));
break;
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
+ break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
err = dsa_slave_port_vlan_del(dev,
SWITCHDEV_OBJ_PORT_VLAN(obj));
@@ -459,6 +532,10 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
SWITCHDEV_OBJ_PORT_FDB(obj),
cb);
break;
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+ cb);
+ break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
err = dsa_slave_port_vlan_dump(dev,
SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -481,8 +558,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
p->bridge_dev = br;
- if (ds->drv->port_bridge_join)
- ret = ds->drv->port_bridge_join(ds, p->port, br);
+ if (ds->ops->port_bridge_join)
+ ret = ds->ops->port_bridge_join(ds, p->port, br);
return ret == -EOPNOTSUPP ? 0 : ret;
}
@@ -493,16 +570,15 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
struct dsa_switch *ds = p->parent;
- if (ds->drv->port_bridge_leave)
- ds->drv->port_bridge_leave(ds, p->port);
+ if (ds->ops->port_bridge_leave)
+ ds->ops->port_bridge_leave(ds, p->port);
p->bridge_dev = NULL;
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
- if (ds->drv->port_stp_state_set)
- ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
}
static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -565,7 +641,8 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
/* ethtool operations *******************************************************/
static int
-dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+dsa_slave_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
{
struct dsa_slave_priv *p = netdev_priv(dev);
int err;
@@ -574,19 +651,20 @@ dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
if (p->phy != NULL) {
err = phy_read_status(p->phy);
if (err == 0)
- err = phy_ethtool_gset(p->phy, cmd);
+ err = phy_ethtool_ksettings_get(p->phy, cmd);
}
return err;
}
static int
-dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+dsa_slave_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
{
struct dsa_slave_priv *p = netdev_priv(dev);
if (p->phy != NULL)
- return phy_ethtool_sset(p->phy, cmd);
+ return phy_ethtool_ksettings_set(p->phy, cmd);
return -EOPNOTSUPP;
}
@@ -605,8 +683,8 @@ static int dsa_slave_get_regs_len(struct net_device *dev)
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->get_regs_len)
- return ds->drv->get_regs_len(ds, p->port);
+ if (ds->ops->get_regs_len)
+ return ds->ops->get_regs_len(ds, p->port);
return -EOPNOTSUPP;
}
@@ -617,8 +695,8 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->get_regs)
- ds->drv->get_regs(ds, p->port, regs, _p);
+ if (ds->ops->get_regs)
+ ds->ops->get_regs(ds, p->port, regs, _p);
}
static int dsa_slave_nway_reset(struct net_device *dev)
@@ -651,8 +729,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev)
if (ds->cd && ds->cd->eeprom_len)
return ds->cd->eeprom_len;
- if (ds->drv->get_eeprom_len)
- return ds->drv->get_eeprom_len(ds);
+ if (ds->ops->get_eeprom_len)
+ return ds->ops->get_eeprom_len(ds);
return 0;
}
@@ -663,8 +741,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->get_eeprom)
- return ds->drv->get_eeprom(ds, eeprom, data);
+ if (ds->ops->get_eeprom)
+ return ds->ops->get_eeprom(ds, eeprom, data);
return -EOPNOTSUPP;
}
@@ -675,8 +753,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->set_eeprom)
- return ds->drv->set_eeprom(ds, eeprom, data);
+ if (ds->ops->set_eeprom)
+ return ds->ops->set_eeprom(ds, eeprom, data);
return -EOPNOTSUPP;
}
@@ -694,8 +772,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + len, "tx_bytes", len);
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
- if (ds->drv->get_strings != NULL)
- ds->drv->get_strings(ds, p->port, data + 4 * len);
+ if (ds->ops->get_strings)
+ ds->ops->get_strings(ds, p->port, data + 4 * len);
}
}
@@ -714,8 +792,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data);
}
- if (ds->drv->get_ethtool_stats)
- ds->drv->get_ethtool_stats(ds, cpu_port, data + count);
+ if (ds->ops->get_ethtool_stats)
+ ds->ops->get_ethtool_stats(ds, cpu_port, data + count);
}
static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
@@ -727,8 +805,8 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
if (dst->master_ethtool_ops.get_sset_count)
count += dst->master_ethtool_ops.get_sset_count(dev, sset);
- if (sset == ETH_SS_STATS && ds->drv->get_sset_count)
- count += ds->drv->get_sset_count(ds);
+ if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds);
return count;
}
@@ -755,14 +833,14 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
dst->master_ethtool_ops.get_strings(dev, stringset, data);
}
- if (stringset == ETH_SS_STATS && ds->drv->get_strings) {
+ if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
ndata = data + mcount * len;
/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
* the output after to prepend our CPU port prefix we
* constructed earlier
*/
- ds->drv->get_strings(ds, cpu_port, ndata);
- count = ds->drv->get_sset_count(ds);
+ ds->ops->get_strings(ds, cpu_port, ndata);
+ count = ds->ops->get_sset_count(ds);
for (i = 0; i < count; i++) {
memmove(ndata + (i * len + sizeof(pfx)),
ndata + i * len, len - sizeof(pfx));
@@ -782,8 +860,8 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
data[1] = dev->stats.tx_bytes;
data[2] = dev->stats.rx_packets;
data[3] = dev->stats.rx_bytes;
- if (ds->drv->get_ethtool_stats != NULL)
- ds->drv->get_ethtool_stats(ds, p->port, data + 4);
+ if (ds->ops->get_ethtool_stats)
+ ds->ops->get_ethtool_stats(ds, p->port, data + 4);
}
static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
@@ -795,8 +873,8 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
int count;
count = 4;
- if (ds->drv->get_sset_count != NULL)
- count += ds->drv->get_sset_count(ds);
+ if (ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds);
return count;
}
@@ -809,8 +887,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- if (ds->drv->get_wol)
- ds->drv->get_wol(ds, p->port, w);
+ if (ds->ops->get_wol)
+ ds->ops->get_wol(ds, p->port, w);
}
static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
@@ -819,8 +897,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_switch *ds = p->parent;
int ret = -EOPNOTSUPP;
- if (ds->drv->set_wol)
- ret = ds->drv->set_wol(ds, p->port, w);
+ if (ds->ops->set_wol)
+ ret = ds->ops->set_wol(ds, p->port, w);
return ret;
}
@@ -831,10 +909,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
struct dsa_switch *ds = p->parent;
int ret;
- if (!ds->drv->set_eee)
+ if (!ds->ops->set_eee)
return -EOPNOTSUPP;
- ret = ds->drv->set_eee(ds, p->port, p->phy, e);
+ ret = ds->ops->set_eee(ds, p->port, p->phy, e);
if (ret)
return ret;
@@ -850,10 +928,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
struct dsa_switch *ds = p->parent;
int ret;
- if (!ds->drv->get_eee)
+ if (!ds->ops->get_eee)
return -EOPNOTSUPP;
- ret = ds->drv->get_eee(ds, p->port, e);
+ ret = ds->ops->get_eee(ds, p->port, e);
if (ret)
return ret;
@@ -914,8 +992,6 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
}
static const struct ethtool_ops dsa_slave_ethtool_ops = {
- .get_settings = dsa_slave_get_settings,
- .set_settings = dsa_slave_set_settings,
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
.get_regs = dsa_slave_get_regs,
@@ -931,6 +1007,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_wol = dsa_slave_get_wol,
.set_eee = dsa_slave_set_eee,
.get_eee = dsa_slave_get_eee,
+ .get_link_ksettings = dsa_slave_get_link_ksettings,
+ .set_link_ksettings = dsa_slave_set_link_ksettings,
};
static const struct net_device_ops dsa_slave_netdev_ops = {
@@ -988,8 +1066,8 @@ static void dsa_slave_adjust_link(struct net_device *dev)
p->old_pause = p->phy->pause;
}
- if (ds->drv->adjust_link && status_changed)
- ds->drv->adjust_link(ds, p->port, p->phy);
+ if (ds->ops->adjust_link && status_changed)
+ ds->ops->adjust_link(ds, p->port, p->phy);
if (status_changed)
phy_print_status(p->phy);
@@ -1004,8 +1082,8 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
if (dev) {
p = netdev_priv(dev);
ds = p->parent;
- if (ds->drv->fixed_link_update)
- ds->drv->fixed_link_update(ds, p->port, status);
+ if (ds->ops->fixed_link_update)
+ ds->ops->fixed_link_update(ds, p->port, status);
}
return 0;
@@ -1027,10 +1105,8 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
/* Use already configured phy mode */
if (p->phy_interface == PHY_INTERFACE_MODE_NA)
p->phy_interface = p->phy->interface;
- phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
- p->phy_interface);
-
- return 0;
+ return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
+ p->phy_interface);
}
static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
@@ -1049,7 +1125,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
p->phy_interface = mode;
phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
- if (of_phy_is_fixed_link(port_dn)) {
+ if (!phy_dn && of_phy_is_fixed_link(port_dn)) {
/* In the case of a fixed PHY, the DT node associated
* to the fixed PHY is the Port DT node
*/
@@ -1059,11 +1135,11 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
return ret;
}
phy_is_fixed = true;
- phy_dn = port_dn;
+ phy_dn = of_node_get(port_dn);
}
- if (ds->drv->get_phy_flags)
- phy_flags = ds->drv->get_phy_flags(ds, p->port);
+ if (ds->ops->get_phy_flags)
+ phy_flags = ds->ops->get_phy_flags(ds, p->port);
if (phy_dn) {
int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1078,6 +1154,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
ret = dsa_slave_phy_connect(p, slave_dev, phy_id);
if (ret) {
netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret);
+ of_node_put(phy_dn);
return ret;
}
} else {
@@ -1086,6 +1163,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
phy_flags,
p->phy_interface);
}
+
+ of_node_put(phy_dn);
}
if (p->phy && phy_is_fixed)
@@ -1098,6 +1177,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
ret = dsa_slave_phy_connect(p, slave_dev, p->port);
if (ret) {
netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
+ if (phy_is_fixed)
+ of_phy_deregister_fixed_link(port_dn);
return ret;
}
}
@@ -1120,6 +1201,8 @@ int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ netif_device_detach(slave_dev);
+
if (p->phy) {
phy_stop(p->phy);
p->old_pause = -1;
@@ -1169,6 +1252,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
+ slave_dev->min_mtu = 0;
+ slave_dev->max_mtu = ETH_MAX_MTU;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
@@ -1213,10 +1298,18 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
void dsa_slave_destroy(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ struct dsa_switch *ds = p->parent;
+ struct device_node *port_dn;
+
+ port_dn = ds->ports[p->port].dn;
netif_carrier_off(slave_dev);
- if (p->phy)
+ if (p->phy) {
phy_disconnect(p->phy);
+
+ if (of_phy_is_fixed_link(port_dn))
+ of_phy_deregister_fixed_link(port_dn);
+ }
unregister_netdev(slave_dev);
free_netdev(slave_dev);
}
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 000000000000..0c90cacee7aa
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/etherdevice.h>
+#include "dsa_priv.h"
+
+#define QCA_HDR_LEN 2
+#define QCA_HDR_VERSION 0x2
+
+#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14)
+#define QCA_HDR_RECV_VERSION_S 14
+#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11)
+#define QCA_HDR_RECV_PRIORITY_S 11
+#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6)
+#define QCA_HDR_RECV_TYPE_S 6
+#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3)
+#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
+
+#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14)
+#define QCA_HDR_XMIT_VERSION_S 14
+#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11)
+#define QCA_HDR_XMIT_PRIORITY_S 11
+#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8)
+#define QCA_HDR_XMIT_CONTROL_S 8
+#define QCA_HDR_XMIT_FROM_CPU BIT(7)
+#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0)
+
+static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ u16 *phdr, hdr;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (skb_cow_head(skb, 0) < 0)
+ goto out_free;
+
+ skb_push(skb, QCA_HDR_LEN);
+
+ memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
+ phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
+
+ /* Set the version field, and set destination port information */
+ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
+ QCA_HDR_XMIT_FROM_CPU |
+ BIT(p->port);
+
+ *phdr = htons(hdr);
+
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ u8 ver;
+ int port;
+ __be16 *phdr, hdr;
+
+ if (unlikely(!dst))
+ goto out_drop;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+ goto out_drop;
+
+ /* The QCA header is added by the switch between src addr and Ethertype
+ * At this point, skb->data points to ethertype so header should be
+ * right before
+ */
+ phdr = (__be16 *)(skb->data - 2);
+ hdr = ntohs(*phdr);
+
+ /* Make sure the version is correct */
+ ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+ if (unlikely(ver != QCA_HDR_VERSION))
+ goto out_drop;
+
+ /* Remove QCA tag and recalculate checksum */
+ skb_pull_rcsum(skb, QCA_HDR_LEN);
+ memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
+ ETH_HLEN - QCA_HDR_LEN);
+
+ /* This protocol doesn't support cascading multiple switches so it's
+ * safe to assume the switch is first in the tree
+ */
+ ds = dst->ds[0];
+ if (!ds)
+ goto out_drop;
+
+ /* Get source port information */
+ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+ if (!ds->ports[port].netdev)
+ goto out_drop;
+
+ /* Update skb & forward the frame accordingly */
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ds->ports[port].netdev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops qca_netdev_ops = {
+ .xmit = qca_tag_xmit,
+ .rcv = qca_tag_rcv,
+};
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 66dff5e3d772..516c87e75de7 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,6 +62,7 @@
#include <net/dsa.h>
#include <net/flow_dissector.h>
#include <linux/uaccess.h>
+#include <net/pkt_sched.h>
__setup("ether=", netdev_boot_setup);
@@ -322,8 +323,7 @@ EXPORT_SYMBOL(eth_mac_addr);
*/
int eth_change_mtu(struct net_device *dev, int new_mtu)
{
- if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
- return -EINVAL;
+ netdev_warn(dev, "%s is deprecated\n", __func__);
dev->mtu = new_mtu;
return 0;
}
@@ -356,9 +356,12 @@ void ether_setup(struct net_device *dev)
dev->header_ops = &eth_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
- dev->tx_queue_len = 1000; /* Ethernet wants good queues */
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
@@ -439,7 +442,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, sizeof(*eh));
skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
- pp = ptype->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 16737cd8dae8..fc65b145f6e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -398,6 +398,7 @@ void hsr_dev_setup(struct net_device *dev)
random_ether_addr(dev->dev_addr);
ether_setup(dev);
+ dev->min_mtu = 0;
dev->header_ops = &hsr_header_ops;
dev->netdev_ops = &hsr_device_ops;
SET_NETDEV_DEVTYPE(dev, &hsr_type);
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 5ee1d43f1310..4ebe2aa3e7d3 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -300,10 +300,6 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb,
struct hsr_frame_info *frame)
{
- struct net_device *master_dev;
-
- master_dev = hsr_port_get_hsr(hsr, HSR_PT_MASTER)->dev;
-
if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) {
frame->is_local_exclusive = true;
skb->pkt_type = PACKET_HOST;
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index d4d1617f43a8..1ab30e7d3f99 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -131,13 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
[HSR_A_IF2_SEQ] = { .type = NLA_U16 },
};
-static struct genl_family hsr_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "HSR",
- .version = 1,
- .maxattr = HSR_A_MAX,
-};
+static struct genl_family hsr_genl_family;
static const struct genl_multicast_group hsr_mcgrps[] = {
{ .name = "hsr-network", },
@@ -467,6 +461,18 @@ static const struct genl_ops hsr_ops[] = {
},
};
+static struct genl_family hsr_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "HSR",
+ .version = 1,
+ .maxattr = HSR_A_MAX,
+ .module = THIS_MODULE,
+ .ops = hsr_ops,
+ .n_ops = ARRAY_SIZE(hsr_ops),
+ .mcgrps = hsr_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
+};
+
int __init hsr_netlink_init(void)
{
int rc;
@@ -475,8 +481,7 @@ int __init hsr_netlink_init(void)
if (rc)
goto fail_rtnl_link_register;
- rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops,
- hsr_mcgrps);
+ rc = genl_register_family(&hsr_genl_family);
if (rc)
goto fail_genl_register_family;
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index 5ac778962e4e..ac7c96b73ad5 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -7,7 +7,7 @@
#include <net/inet_frag.h>
#include <net/6lowpan.h>
-typedef unsigned __bitwise__ lowpan_rx_result;
+typedef unsigned __bitwise lowpan_rx_result;
#define RX_CONTINUE ((__force lowpan_rx_result) 0u)
#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u)
#define RX_DROP ((__force lowpan_rx_result) 2u)
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 4adfd4d5471b..9b92ade687a3 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -7,5 +7,3 @@ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
ieee802154_socket-y := socket.o
CFLAGS_trace.o := -I$(src)
-
-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index c8133c07ceee..6bde9e5a5503 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -28,14 +28,6 @@
static unsigned int ieee802154_seq_num;
static DEFINE_SPINLOCK(ieee802154_seq_lock);
-struct genl_family nl802154_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = IEEE802154_NL_NAME,
- .version = 1,
- .maxattr = IEEE802154_ATTR_MAX,
-};
-
/* Requests to userspace */
struct sk_buff *ieee802154_nl_create(int flags, u8 req)
{
@@ -139,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = {
[IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
};
+struct genl_family nl802154_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = IEEE802154_NL_NAME,
+ .version = 1,
+ .maxattr = IEEE802154_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = ieee8021154_ops,
+ .n_ops = ARRAY_SIZE(ieee8021154_ops),
+ .mcgrps = ieee802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
+};
+
int __init ieee802154_nl_init(void)
{
- return genl_register_family_with_ops_groups(&nl802154_family,
- ieee8021154_ops,
- ieee802154_mcgrps);
+ return genl_register_family(&nl802154_family);
}
void ieee802154_nl_exit(void)
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 77d73014bde3..dc2960be51e0 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -286,9 +286,12 @@ int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
return -EINVAL; /* name should be null-terminated */
+ rc = -ENODEV;
dev = dev_get_by_name(genl_info_net(info), name);
if (!dev)
- return -ENODEV;
+ return rc;
+ if (dev->type != ARPHRD_IEEE802154)
+ goto out;
phy = dev->ieee802154_ptr->wpan_phy;
BUG_ON(!phy);
@@ -342,6 +345,7 @@ nla_put_failure:
nlmsg_free(msg);
out_dev:
wpan_phy_put(phy);
+out:
if (dev)
dev_put(dev);
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d90a4ed5b8a0..fc60cd061f39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -26,23 +26,8 @@
#include "rdev-ops.h"
#include "core.h"
-static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
-static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
/* the netlink family */
-static struct genl_family nl802154_fam = {
- .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
- .name = NL802154_GENL_NAME, /* have users key off the name instead */
- .hdrsize = 0, /* no private header */
- .version = 1, /* no particular meaning now */
- .maxattr = NL802154_ATTR_MAX,
- .netnsok = true,
- .pre_doit = nl802154_pre_doit,
- .post_doit = nl802154_post_doit,
-};
+static struct genl_family nl802154_fam;
/* multicast groups */
enum nl802154_multicast_groups {
@@ -263,13 +248,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
if (!cb->args[0]) {
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
- nl802154_fam.attrbuf, nl802154_fam.maxattr,
+ genl_family_attrbuf(&nl802154_fam),
+ nl802154_fam.maxattr,
nl802154_policy);
if (err)
goto out_unlock;
*wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
- nl802154_fam.attrbuf);
+ genl_family_attrbuf(&nl802154_fam));
if (IS_ERR(*wpan_dev)) {
err = PTR_ERR(*wpan_dev);
goto out_unlock;
@@ -575,7 +561,7 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl802154_dump_wpan_phy_state *state)
{
- struct nlattr **tb = nl802154_fam.attrbuf;
+ struct nlattr **tb = genl_family_attrbuf(&nl802154_fam);
int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
tb, nl802154_fam.maxattr, nl802154_policy);
@@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = {
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
+static struct genl_family nl802154_fam __ro_after_init = {
+ .name = NL802154_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL802154_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl802154_pre_doit,
+ .post_doit = nl802154_post_doit,
+ .module = THIS_MODULE,
+ .ops = nl802154_ops,
+ .n_ops = ARRAY_SIZE(nl802154_ops),
+ .mcgrps = nl802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
+};
+
/* initialisation/exit functions */
-int nl802154_init(void)
+int __init nl802154_init(void)
{
- return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops,
- nl802154_mcgrps);
+ return genl_register_family(&nl802154_fam);
}
void nl802154_exit(void)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..6e7baaf814c6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,6 +430,14 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_RAW_DIAG
+ tristate "RAW: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for RAW socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
config INET_DIAG_DESTROY
bool "INET: allow privileged process to administratively close sockets"
depends on INET_DIAG
@@ -640,6 +648,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +697,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
@@ -697,6 +723,7 @@ config DEFAULT_TCP_CONG
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
default "cdg" if DEFAULT_CDG
+ default "bbr" if DEFAULT_BBR
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..48af58a5686e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -40,7 +40,9 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55513e654d79..f75069883f2b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -90,7 +90,7 @@
#include <linux/random.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -211,24 +211,19 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- /* Check special setups for testing purpose to enable TFO w/o
- * requiring TCP_FASTOPEN sockopt.
+ /* Enable TFO w/o requiring TCP_FASTOPEN socket option.
* Note that only TCP sockets (SOCK_STREAM) will reach here.
- * Also fastopenq may already been allocated because this
- * socket was in TCP_LISTEN state previously but was
- * shutdown() (rather than close()).
+ * Also fastopen backlog may already been set via the option
+ * because the socket was in TCP_LISTEN state previously but
+ * was shutdown() rather than close().
*/
- if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+ (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
!inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
- if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
- fastopen_queue_tune(sk, backlog);
- else if ((sysctl_tcp_fastopen &
- TFO_SERVER_WO_SOCKOPT2) != 0)
- fastopen_queue_tune(sk,
- ((uint)sysctl_tcp_fastopen) >> 16);
-
+ fastopen_queue_tune(sk, backlog);
tcp_fastopen_init_key_once(true);
}
+
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
@@ -379,8 +374,18 @@ lookup_protocol:
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err)
+ if (err) {
sk_common_release(sk);
+ goto out;
+ }
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
out:
return err;
@@ -538,9 +543,9 @@ EXPORT_SYMBOL(inet_dgram_connect);
static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
@@ -550,13 +555,12 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
*/
while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
release_sock(sk);
- timeo = schedule_timeout(timeo);
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
lock_sock(sk);
if (signal_pending(current) || !timeo)
break;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending -= writebias;
return timeo;
}
@@ -921,6 +925,8 @@ const struct proto_ops inet_stream_ops = {
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
+ .read_sock = tcp_read_sock,
+ .peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1195,7 +1201,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, encap;
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
@@ -1237,7 +1243,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
/* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(iph->frag_off & htons(IP_DF)))
+ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
goto out;
}
@@ -1248,6 +1254,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1262,9 +1270,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
- tot_len = skb_shinfo(skb)->gso_size +
- SKB_GSO_CB(skb)->data_offset +
- skb->head - (unsigned char *)iph;
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
@@ -1388,7 +1400,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = ops->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
/*
* ax25 -> ASCII conversion
*/
-static char *ax2asc2(ax25_address *a, char *buf)
+static void ax2asc2(ax25_address *a, char *buf)
{
char c, *s;
int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = n + '0';
*s++ = '\0';
- if (*buf == '\0' || *buf == '-')
- return "*";
-
- return buf;
+ if (*buf == '\0' || *buf == '-') {
+ buf[0] = '*';
+ buf[1] = '\0';
+ }
}
#endif /* CONFIG_AX25 */
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
}
#endif
sprintf(tbuf, "%pI4", n->primary_key);
- seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
read_unlock(&n->lock);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
goto validate_return_locked;
}
+ if (opt_iter + 1 == opt_len) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
tag_len = tag[1];
if (tag_len > (opt_len - opt_iter)) {
err_offset = opt_iter + 1;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 062a67ca9a21..4cd2ee8857d2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -26,7 +26,7 @@
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d95631d09248..20fb25e3027b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -476,7 +476,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
esph = (void *)skb_push(skb, 4);
*seqhi = esph->spi;
esph->spi = esph->seq_no;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1b25daf8c7f1..7db2ad2e82d3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -14,7 +14,7 @@
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -46,6 +46,7 @@
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -85,7 +86,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- if (id == RT_TABLE_LOCAL)
+ if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
alias = fib_new_table(net, RT_TABLE_MAIN);
tb = fib_trie_table(id, alias);
@@ -93,9 +94,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
return NULL;
switch (id) {
- case RT_TABLE_LOCAL:
- rcu_assign_pointer(net->ipv4.fib_local, tb);
- break;
case RT_TABLE_MAIN:
rcu_assign_pointer(net->ipv4.fib_main, tb);
break;
@@ -137,9 +135,6 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
switch (new->tb_id) {
- case RT_TABLE_LOCAL:
- rcu_assign_pointer(net->ipv4.fib_local, new);
- break;
case RT_TABLE_MAIN:
rcu_assign_pointer(net->ipv4.fib_main, new);
break;
@@ -157,7 +152,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
int fib_unmerge(struct net *net)
{
- struct fib_table *old, *new;
+ struct fib_table *old, *new, *main_table;
/* attempt to fetch local table if it has been allocated */
old = fib_get_table(net, RT_TABLE_LOCAL);
@@ -168,11 +163,21 @@ int fib_unmerge(struct net *net)
if (!new)
return -ENOMEM;
+ /* table is already unmerged */
+ if (new == old)
+ return 0;
+
/* replace merged table with clean table */
- if (new != old) {
- fib_replace_table(net, old, new);
- fib_free_table(old);
- }
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+
+ /* attempt to fetch main table if it has been allocated */
+ main_table = fib_get_table(net, RT_TABLE_MAIN);
+ if (!main_table)
+ return 0;
+
+ /* flush local entries from main table */
+ fib_table_flush_external(main_table);
return 0;
}
@@ -188,26 +193,13 @@ static void fib_flush(struct net *net)
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
- flushed += fib_table_flush(tb);
+ flushed += fib_table_flush(net, tb);
}
if (flushed)
rt_cache_flush(net);
}
-void fib_flush_external(struct net *net)
-{
- struct fib_table *tb;
- struct hlist_head *head;
- unsigned int h;
-
- for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist)
- fib_table_flush_external(tb);
- }
-}
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -596,13 +588,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCDELRT) {
tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
else
err = -ESRCH;
} else {
tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
else
err = -ENOBUFS;
}
@@ -629,6 +621,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -685,6 +678,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_mx_len = nla_len(attr);
break;
case RTA_MULTIPATH:
+ err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+ nla_len(attr));
+ if (err < 0)
+ goto errout;
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
@@ -699,6 +696,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
break;
case RTA_ENCAP_TYPE:
cfg->fc_encap_type = nla_get_u16(attr);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ if (err < 0)
+ goto errout;
break;
}
}
@@ -725,7 +725,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
errout:
return err;
}
@@ -747,7 +747,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
errout:
return err;
}
@@ -834,9 +834,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- fib_table_insert(tb, &cfg);
+ fib_table_insert(net, tb, &cfg);
else
- fib_table_delete(tb, &cfg);
+ fib_table_delete(net, tb, &cfg);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -1227,6 +1227,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+ net->ipv4.fib_seq = 0;
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
@@ -1250,7 +1252,6 @@ static void ip_fib_net_exit(struct net *net)
rtnl_lock();
#ifdef CONFIG_IP_MULTIPLE_TABLES
- RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
#endif
@@ -1261,7 +1262,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
- fib_table_flush(tb);
+ fib_table_flush(net, tb);
fib_free_table(tb);
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..2e50062f642d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
};
int err;
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
#ifdef CONFIG_IP_ROUTE_CLASSID
if (arg.rule)
@@ -161,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type)
+{
+ struct fib_notifier_info info;
+
+ return call_fib_notifiers(net, event_type, &info);
+}
+
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
FRA_GENERIC_POLICY,
[FRA_FLOW] = { .type = NLA_U32 },
@@ -217,7 +228,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
err = 0;
errout:
@@ -239,7 +250,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
errout:
return err;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e9f56225e53f..9a375b908d01 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -13,7 +13,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
@@ -1278,8 +1279,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
- if (fi->fib_nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
+ if (fi->fib_nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1315,8 +1317,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
- if (nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, nh->nh_lwtstate);
+ if (nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
+
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
@@ -1580,7 +1584,8 @@ static bool fib_good_nh(const struct fib_nh *nh)
rcu_read_lock_bh();
- n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
+ n = __ipv4_neigh_lookup_noref(nh->nh_dev,
+ (__force u32)nh->nh_gw);
if (n)
state = n->nud_state;
@@ -1616,8 +1621,13 @@ void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, int mp_hash)
{
+ bool oif_check;
+
+ oif_check = (fl4->flowi4_oif == 0 ||
+ fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (res->fi->fib_nhs > 1 && oif_check) {
if (mp_hash < 0)
mp_hash = get_hash_from_flowi4(fl4) >> 1;
@@ -1627,7 +1637,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
#endif
if (!res->prefixlen &&
res->table->tb_num_default > 1 &&
- res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ res->type == RTN_UNICAST && oif_check)
fib_select_default(fl4, res);
if (!fl4->saddr)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..2919d1a10cfd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
#define VERSION "0.409"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -73,6 +73,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
+#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,10 +81,136 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/switchdev.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ net->ipv4.fib_seq++;
+ info->net = net;
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -681,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
+ unsigned char slen_max;
+
+ /* only vector 0 can have a suffix length greater than or equal to
+ * tn->pos + tn->bits, the second highest node will have a suffix
+ * length at most of tn->pos + tn->bits - 1
+ */
+ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
/* search though the list of children looking for nodes that might
* have a suffix greater than the one we currently have. This is
@@ -698,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn)
slen = n->slen;
i &= ~(stride - 1);
- /* if slen covers all but the last bit we can stop here
- * there will be nothing longer than that since only node
- * 0 and 1 << (bits - 1) could have that as their suffix
- * length.
- */
- if ((slen + 1) >= (tn->pos + tn->bits))
+ /* stop searching if we have hit the maximum possible value */
+ if (slen >= slen_max)
break;
}
@@ -875,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
return collapse(t, tn);
/* update parent in case halve failed */
- tp = node_parent(tn);
-
- /* Return if at least one deflate was run */
- if (max_work != MAX_WORK)
- return tp;
-
- /* push the suffix length to the parent node */
- if (tn->slen > tn->pos) {
- unsigned char slen = update_suffix(tn);
-
- if (slen > tp->slen)
- tp->slen = slen;
- }
-
- return tp;
+ return node_parent(tn);
}
-static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
- while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
- if (update_suffix(tp) > l->slen)
+ unsigned char node_slen = tn->slen;
+
+ while ((node_slen > tn->pos) && (node_slen > slen)) {
+ slen = update_suffix(tn);
+ if (node_slen == slen)
break;
- tp = node_parent(tp);
+
+ tn = node_parent(tn);
+ node_slen = tn->slen;
}
}
-static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
- /* if this is a new leaf then tn will be NULL and we can sort
- * out parent suffix lengths as a part of trie_rebalance
- */
- while (tn->slen < l->slen) {
- tn->slen = l->slen;
+ while (tn->slen < slen) {
+ tn->slen = slen;
tn = node_parent(tn);
}
}
@@ -1028,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
}
/* Case 3: n is NULL, and will just insert a new leaf */
+ node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
@@ -1069,19 +1188,20 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
- leaf_push_suffix(tp, l);
+ node_push_suffix(tp, new->fa_slen);
}
return 0;
}
/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
- unsigned int nlflags = 0;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1246,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
@@ -1151,6 +1273,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
@@ -1172,17 +1295,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- err = switchdev_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- kmem_cache_free(fn_alias_kmem, new_fa);
- goto out;
- }
-
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1190,8 +1302,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
goto succeeded;
}
@@ -1203,7 +1320,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
- nlflags = NLM_F_APPEND;
+ nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
@@ -1211,6 +1328,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
@@ -1224,30 +1342,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- /* (Optionally) offload fib entry to switch hardware. */
- err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
- cfg->fc_nlflags, tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- goto out_free_new_fa;
- }
-
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_sw_fib_del;
+ goto out_free_new_fa;
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+ cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
-out_sw_fib_del:
- switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1470,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
* out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
+ if (tp->slen == l->slen)
+ node_pull_suffix(tp, tp->pos);
put_child_root(tp, l->key, NULL);
node_free(l);
trie_rebalance(t, tp);
@@ -1482,11 +1594,12 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
- leaf_pull_suffix(tp, l);
+ node_pull_suffix(tp, fa->fa_slen);
}
/* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1539,9 +1652,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
-
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+ fa_to_delete->fa_info, tos, cfg->fc_type,
+ tb->tb_id, 0);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1713,8 +1826,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
local_l = fib_find_node(lt, &local_tp, l->key);
if (fib_insert_alias(lt, local_tp, local_l, new_fa,
- NULL, l->key))
+ NULL, l->key)) {
+ kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
+ }
}
/* stop loop if key wrapped back to 0 */
@@ -1751,6 +1866,10 @@ void fib_table_flush_external(struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1772,8 +1891,6 @@ void fib_table_flush_external(struct fib_table *tb)
}
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
/* if alias was cloned to local then we just
* need to remove the local copy from main
*/
@@ -1785,13 +1902,6 @@ void fib_table_flush_external(struct fib_table *tb)
/* record local slen */
slen = fa->fa_slen;
-
- if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
- continue;
-
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
}
/* update leaf slen */
@@ -1805,7 +1915,7 @@ void fib_table_flush_external(struct fib_table *tb)
}
/* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1826,6 +1936,10 @@ int fib_table_flush(struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1854,9 +1968,11 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1876,6 +1992,62 @@ int fib_table_flush(struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id, 0);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2455,22 +2627,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
struct key_vector *l, **tp = &iter->tnode;
t_key key;
- /* use cache location of next-to-find key */
+ /* use cached location of previously found key */
if (iter->pos > 0 && pos >= iter->pos) {
- pos -= iter->pos;
key = iter->key;
} else {
- iter->pos = 0;
+ iter->pos = 1;
key = 0;
}
- while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ pos -= iter->pos;
+
+ while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
key = l->key + 1;
iter->pos++;
-
- if (--pos <= 0)
- break;
-
l = NULL;
/* handle unlikely case of a key wrap */
@@ -2479,7 +2648,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
}
if (l)
- iter->key = key; /* remember it */
+ iter->key = l->key; /* remember it */
else
iter->pos = 0; /* forget it */
@@ -2507,7 +2676,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
return fib_route_get_idx(iter, *pos);
iter->pos = 0;
- iter->key = 0;
+ iter->key = KEY_MAX;
return SEQ_START_TOKEN;
}
@@ -2516,7 +2685,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct key_vector *l = NULL;
- t_key key = iter->key;
+ t_key key = iter->key + 1;
++*pos;
@@ -2525,7 +2694,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
l = leaf_walk_rcu(&iter->tnode, key);
if (l) {
- iter->key = l->key + 1;
+ iter->key = l->key;
iter->pos++;
} else {
iter->pos = 0;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 321d57f825ce..805f6607f8d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive(struct sock *sk,
if (!ops || !ops->callbacks.gro_receive)
goto out_unlock;
- pp = ops->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
@@ -441,7 +441,7 @@ next_proto:
if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
goto out_unlock;
- pp = ops->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
flush = 0;
out_unlock:
@@ -622,16 +622,9 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
return err;
}
-static struct genl_family fou_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = FOU_GENL_NAME,
- .version = FOU_GENL_VERSION,
- .maxattr = FOU_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family fou_nl_family;
-static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
+static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_PORT] = { .type = NLA_U16, },
[FOU_ATTR_AF] = { .type = NLA_U8, },
[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
@@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
},
};
+static struct genl_family fou_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = fou_nl_ops,
+ .n_ops = ARRAY_SIZE(fou_nl_ops),
+};
+
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
{
return sizeof(struct udphdr);
@@ -1086,8 +1090,7 @@ static int __init fou_init(void)
if (ret)
goto exit;
- ret = genl_register_family_with_ops(&fou_nl_family,
- fou_nl_ops);
+ ret = genl_register_family(&fou_nl_family);
if (ret < 0)
goto unregister;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..d5cac99170b1 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo;
+ bool need_csum, ufo, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
gre_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
@@ -227,7 +229,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
skb_gro_postpull_rcsum(skb, greh, grehlen);
- pp = ptype->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;
out_unlock:
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 38abe70e595f..0777ea949223 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -91,7 +91,7 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
@@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -473,11 +474,12 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
+ fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key_hash(net, fl4,
@@ -502,7 +504,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- if (inet_addr_type_dev_table(net, skb_in->dev,
+ if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
@@ -1045,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb)
if (success) {
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
}
drop:
kfree_skb(skb);
- return 0;
+ return NET_RX_DROP;
csum_error:
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9b4ca87f70ba..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -72,7 +72,7 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device *in_dev)
}
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
@@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
int tv = prandom_u32() % in_dev->mr_maxdelay;
+ unsigned long exp = jiffies + tv + 2;
+
+ if (in_dev->mr_gq_running &&
+ time_after_eq(exp, (in_dev->mr_gq_timer).expires))
+ return;
in_dev->mr_gq_running = 1;
- if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
@@ -472,6 +477,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
continue;
}
+ /* Based on RFC3376 5.1. Should not send source-list change
+ * records when there is a filter mode change.
+ */
+ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
+ (!gdeleted && pmc->crcount)) &&
+ (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+ goto decrease_sf_crcount;
+
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
@@ -499,6 +513,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
@@ -1120,10 +1135,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
spin_unlock_bh(&in_dev->mc_tomb_lock);
}
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+/*
+ * restore ip_mc_list deleted records
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc, *pmc_prev;
- struct ip_sf_list *psf, *psf_next;
+ struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
+ __be32 multiaddr = im->multiaddr;
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
@@ -1139,16 +1159,27 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
in_dev->mc_tomb = pmc->next;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ spin_lock_bh(&im->lock);
if (pmc) {
- for (psf = pmc->tomb; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ im->interface = pmc->interface;
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ im->sfmode = pmc->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ im->tomb = pmc->tomb;
+ im->sources = pmc->sources;
+ for (psf = im->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = im->crcount;
}
in_dev_put(pmc->interface);
kfree(pmc);
}
+ spin_unlock_bh(&im->lock);
}
+/*
+ * flush ip_mc_list deleted records
+ */
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *nextpmc;
@@ -1356,7 +1387,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
ip_mc_hash_add(in_dev, im);
#ifdef CONFIG_IP_MULTICAST
- igmpv3_del_delrec(in_dev, im->multiaddr);
+ igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
if (!in_dev->dead)
@@ -1616,8 +1647,12 @@ void ip_mc_remap(struct in_device *in_dev)
ASSERT_RTNL();
- for_each_pmc_rtnl(in_dev, pmc)
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+ }
}
/* Device going down */
@@ -1638,7 +1673,6 @@ void ip_mc_down(struct in_device *in_dev)
in_dev->mr_gq_running = 0;
if (del_timer(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
- igmpv3_clear_delrec(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
@@ -1678,8 +1712,12 @@ void ip_mc_up(struct in_device *in_dev)
#endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
- for_each_pmc_rtnl(in_dev, pmc)
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+ }
}
/*
@@ -1694,13 +1732,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* Deactivate timers */
ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_clear_delrec(in_dev);
+#endif
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
-
- /* We've dropped the groups in ip_mc_down already */
- ip_mc_clear_src(i);
ip_ma_put(i);
}
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61a9deec2993..19ea045c50ed 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -45,11 +45,12 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax)
+ const struct inet_bind_bucket *tb, bool relax,
+ bool reuseport_ok)
{
struct sock *sk2;
- int reuse = sk->sk_reuse;
- int reuseport = sk->sk_reuseport;
+ bool reuse = sk->sk_reuse;
+ bool reuseport = !!sk->sk_reuseport && reuseport_ok;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -105,6 +106,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct inet_bind_bucket *tb;
kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
+ bool reuseport_ok = !!snum;
if (port) {
have_port:
@@ -165,7 +167,8 @@ other_parity_scan:
smallest_size = tb->num_owners;
smallest_port = port;
}
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
+ reuseport_ok))
goto tb_found;
goto next_port;
}
@@ -206,13 +209,14 @@ tb_found:
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1)
goto success;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
+ reuseport_ok)) {
if ((reuse ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
uid_eq(tb->fastuid, uid))) &&
- smallest_size != -1 && --attempts >= 0) {
+ !snum && smallest_size != -1 && --attempts >= 0) {
spin_unlock_bh(&head->lock);
goto again;
}
@@ -415,7 +419,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -452,7 +456,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 38c2c47fe0e8..4dea33e5f295 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -45,6 +45,7 @@ struct inet_diag_entry {
u16 family;
u16 userlocks;
u32 ifindex;
+ u32 mark;
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -98,6 +99,7 @@ static size_t inet_sk_attr_size(void)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -108,7 +110,8 @@ static size_t inet_sk_attr_size(void)
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns)
+ struct user_namespace *user_ns,
+ bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -135,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ goto errout;
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -148,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
@@ -174,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -193,6 +200,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
goto errout;
+ /*
+ * RAW sockets might have user-defined protocols assigned,
+ * so report the one supplied on socket creation.
+ */
+ if (sk->sk_type == SOCK_RAW) {
+ if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+ goto errout;
+ }
+
if (!icsk) {
handler->idiag_get_info(sk, r, NULL);
goto out;
@@ -273,10 +289,11 @@ static int inet_csk_diag_fill(struct sock *sk,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
- user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+ portid, seq, nlmsg_flags, unlh, net_admin);
}
static int inet_twsk_diag_fill(struct sock *sk,
@@ -318,8 +335,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
+ struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
@@ -333,7 +351,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
inet_diag_msg_common_fill(r, sk);
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+ r->idiag_retrans = reqsk->num_retrans;
BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
offsetof(struct sock, sk_cookie));
@@ -345,6 +363,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = 0;
r->idiag_inode = 0;
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark))
+ return -EMSGSIZE;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -353,7 +375,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -361,10 +383,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
if (sk->sk_state == TCP_NEW_SYN_RECV)
return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -434,7 +456,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -580,6 +603,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+ case INET_DIAG_BC_MARK_COND: {
+ struct inet_diag_markcond *cond;
+
+ cond = (struct inet_diag_markcond *)(op + 1);
+ if ((entry->mark & cond->mask) != cond->mark)
+ yes = 0;
+ break;
+ }
}
if (yes) {
@@ -624,6 +655,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry.dport = ntohs(inet->inet_dport);
entry.ifindex = sk->sk_bound_dev_if;
entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+ if (sk_fullsock(sk))
+ entry.mark = sk->sk_mark;
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else
+ entry.mark = 0;
return inet_diag_bc_run(bc, &entry);
}
@@ -706,10 +743,25 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op,
return true;
}
-static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ *min_len += sizeof(struct inet_diag_markcond);
+ return len >= *min_len;
+}
+
+static int inet_diag_bc_audit(const struct nlattr *attr,
+ const struct sk_buff *skb)
{
- const void *bc = bytecode;
- int len = bytecode_len;
+ bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const void *bytecode, *bc;
+ int bytecode_len, len;
+
+ if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ return -EINVAL;
+
+ bytecode = bc = nla_data(attr);
+ len = bytecode_len = nla_len(attr);
while (len > 0) {
int min_len = sizeof(struct inet_diag_bc_op);
@@ -732,6 +784,12 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_MARK_COND:
+ if (!net_admin)
+ return -EPERM;
+ if (!valid_markcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
@@ -760,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+ const struct nlattr *bc,
+ bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -768,7 +827,8 @@ static int inet_csk_diag_dump(struct sock *sk,
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+ net_admin);
}
static void twsk_build_assert(void)
@@ -801,9 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
+ int i, num, s_i, s_num;
+ struct sock *sk;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -811,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN))
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
@@ -840,25 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (r->id.idiag_dport ||
- cb->args[3] > 0)
- goto next_listen;
-
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
- spin_unlock_bh(&ilb->lock);
+ if (inet_csk_diag_dump(sk, skb, cb, r,
+ bc, net_admin) < 0) {
+ spin_unlock(&ilb->lock);
goto done;
}
next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
++num;
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
}
skip_listen_ht:
cb->args[0] = 1;
@@ -868,13 +922,14 @@ skip_listen_ht:
if (!(idiag_states & ~TCPF_LISTEN))
goto out;
+#define SKARR_SZ 16
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct hlist_nulls_node *node;
- struct sock *sk;
-
- num = 0;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
if (hlist_nulls_empty(&head->chain))
continue;
@@ -882,9 +937,12 @@ skip_listen_ht:
if (i > s_i)
s_num = 0;
+next_chunk:
+ num = 0;
+ accum = 0;
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int state, res;
+ int state;
if (!net_eq(sock_net(sk), net))
continue;
@@ -908,21 +966,35 @@ skip_listen_ht:
if (!inet_diag_bc_sk(bc, sk))
goto next_normal;
- res = sk_diag_fill(sk, skb, r,
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh);
- if (res < 0) {
- spin_unlock_bh(lock);
- goto done;
+ cb->nlh, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
}
-next_normal:
- ++num;
+ sock_gen_put(sk_arr[idx]);
}
-
- spin_unlock_bh(lock);
+ if (res < 0)
+ break;
cond_resched();
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
}
done:
@@ -1020,13 +1092,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) {
struct nlattr *attr;
+ int err;
attr = nlmsg_find_attr(nlh, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (!attr ||
- nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
- inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
- return -EINVAL;
+ err = inet_diag_bc_audit(attr, skb);
+ if (err)
+ return err;
}
{
struct netlink_dump_control c = {
@@ -1051,13 +1123,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
+ int err;
attr = nlmsg_find_attr(h, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (!attr ||
- nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
- inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
- return -EINVAL;
+ err = inet_diag_bc_audit(attr, skb);
+ if (err)
+ return err;
}
{
struct netlink_dump_control c = {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 77c20a489218..ca97835bfec4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -25,6 +25,7 @@
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/tcp.h>
#include <net/sock_reuseport.h>
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
@@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
- const int dif)
+ const int dif, bool exact_dif)
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
@@ -186,7 +187,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score += 4;
@@ -215,11 +216,12 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore = 0, matches = 0, reuseport = 0;
+ bool exact_dif = inet_exact_dif_match(net, skb);
struct sock *sk, *result = NULL;
u32 phash = 0;
sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif);
+ score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
if (score > hiscore) {
reuseport = sk->sk_reuseport;
if (reuseport) {
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 8b4ffd216839..9f0a7b96646f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
- IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..c9c1cb635d9a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -17,7 +17,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static int ipgre_net_id __read_mostly;
-static int gre_tap_net_id __read_mostly;
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
}
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
__be64 tun_id;
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
- tun_id = key_to_tunnel_id(tpi->key);
+ tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ tunnel_id_to_key32(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4d158ff1def1..93157f2f4758 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -15,7 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index dde37fb340bf..b67719f45953 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -42,7 +42,7 @@
* Hirokazu Takahashi: sendfile() on UDP works now.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -73,6 +73,8 @@
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
+#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -98,6 +100,16 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
+ skb->protocol = htons(ETH_P_IP);
+
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
@@ -197,6 +209,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
skb = skb2;
}
+ if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+ int res = lwtunnel_xmit(skb);
+
+ if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ return res;
+ }
+
rcu_read_lock_bh();
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
@@ -223,19 +242,23 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *segs;
int ret = 0;
- /* common case: fragmentation of segments is not allowed,
- * or seglen is <= mtu
+ /* common case: seglen is <= mtu
*/
- if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) ||
- skb_gso_validate_mtu(skb, mtu))
+ if (skb_gso_validate_mtu(skb, mtu))
return ip_finish_output2(net, sk, skb);
- /* Slowpath - GSO segment length is exceeding the dst MTU.
+ /* Slowpath - GSO segment length exceeds the egress MTU.
*
- * This can happen in two cases:
- * 1) TCP GRO packet, DF bit not set
- * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
- * from host network stack.
+ * This can happen in several cases:
+ * - Forwarding of a TCP GRO skb, when DF flag is not set.
+ * - Forwarding of an skb that arrived on a virtualization interface
+ * (virtio-net/vhost/tap) with TSO/GSO size set by other network
+ * stack.
+ * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
+ * interface with a smaller MTU.
+ * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
+ * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
+ * insufficent MTU.
*/
features = netif_skb_features(skb);
BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
@@ -265,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -283,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -320,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -336,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -482,7 +526,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
skb_copy_secmark(to, from);
@@ -522,7 +566,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
{
struct iphdr *iph;
int ptr;
- struct net_device *dev;
struct sk_buff *skb2;
unsigned int mtu, hlen, left, len, ll_rs;
int offset;
@@ -530,8 +573,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rtable *rt = skb_rtable(skb);
int err = 0;
- dev = rt->dst.dev;
-
/* for offloaded checksums cleanup checksum before fragmentation */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
@@ -564,7 +605,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
*/
if (skb_has_frag_list(skb)) {
struct sk_buff *frag, *frag2;
- int first_len = skb_pagelen(skb);
+ unsigned int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -785,11 +826,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (copy_from_iter(to, len, &msg->msg_iter) != len)
+ if (!copy_from_iter_full(to, len, &msg->msg_iter))
return -EFAULT;
} else {
__wsum csum = 0;
- if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
+ if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
@@ -917,7 +958,7 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
@@ -1575,7 +1616,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -1587,6 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_mark = fl4.flowi4_mark;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
if (unlikely(err)) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..900011709e3b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -44,7 +44,7 @@
#include <net/ip_fib.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* SOL_IP control messages.
@@ -97,8 +97,19 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+ int val;
+
+ if (IPCB(skb)->frag_max_size == 0)
+ return;
+
+ val = IPCB(skb)->frag_max_size;
+ put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
- int offset)
+ int tlen, int offset)
{
__wsum csum = skb->csum;
@@ -106,8 +117,9 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
return;
if (offset != 0)
- csum = csum_sub(csum, csum_partial(skb_transport_header(skb),
- offset, 0));
+ csum = csum_sub(csum,
+ csum_partial(skb_transport_header(skb) + tlen,
+ offset, 0));
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}
@@ -136,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
__be16 *ports = (__be16 *)skb_transport_header(skb);
- if (skb_transport_offset(skb) + 4 > skb->len)
+ if (skb_transport_offset(skb) + 4 > (int)skb->len)
return;
/* All current transport protocols have the port numbers in the
@@ -152,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
- int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(skb->sk);
+ struct inet_sock *inet = inet_sk(sk);
unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
@@ -216,7 +228,10 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
}
if (flags & IP_CMSG_CHECKSUM)
- ip_cmsg_recv_checksum(msg, skb, offset);
+ ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+ if (flags & IP_CMSG_RECVFRAGSIZE)
+ ip_cmsg_recv_fragsize(msg, skb);
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
@@ -284,9 +299,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->ttl = val;
break;
case IP_TOS:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
@@ -610,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_MULTICAST_LOOP:
case IP_RECVORIGDSTADDR:
case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -722,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
}
break;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ goto e_inval;
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
+ break;
case IP_TOS: /* This sets both TOS and Precedence */
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
@@ -1198,14 +1225,27 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* which has interface index (iif) as the first member of the
* underlying inet{6}_skb_parm struct. This code then overlays
* PKTINFO_SKB_CB and in_pktinfo also has iif as the first
- * element so the iif is picked up from the prior IPCB
+ * element so the iif is picked up from the prior IPCB. If iif
+ * is the loopback interface, then return the sending interface
+ * (e.g., process binds socket to eth0 for Tx which is
+ * redirected to loopback in the rtable/dst).
*/
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+ pktinfo->ipi_ifindex = inet_iif(skb);
+
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- skb_dst_drop(skb);
+ /* We need to keep the dst for __ip_options_echo()
+ * We could restrict the test to opt.ts_needtime || opt.srr,
+ * but the following is good enough as IP options are not often used.
+ */
+ if (unlikely(IPCB(skb)->opt.optlen))
+ skb_dst_force(skb);
+ else
+ skb_dst_drop(skb);
}
int ip_setsockopt(struct sock *sk, int level,
@@ -1353,6 +1393,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_CHECKSUM:
val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
break;
+ case IP_RECVFRAGSIZE:
+ val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
+ break;
case IP_TOS:
val = inet->tos;
break;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..823abaef006b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -357,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
{
struct ip_tunnel *nt;
struct net_device *dev;
+ int t_hlen;
BUG_ON(!itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
@@ -366,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
dev->mtu = ip_tunnel_bind_dev(dev);
nt = netdev_priv(dev);
+ t_hlen = nt->hlen + sizeof(struct iphdr);
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
ip_tunnel_add(itn, nt);
return nt;
}
@@ -546,6 +551,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ u32 headroom = sizeof(struct iphdr);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ const struct iphdr *inner_iph;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ __be16 df = 0;
+ u8 tos, ttl;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto tx_error;
+ key = &tun_info->key;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ tos = key->tos;
+ if (tos == 1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+ init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+ RT_TOS(tos), tunnel->parms.link);
+ if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_error;
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = key->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+ if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ df = htons(IP_DF);
+ else if (skb->protocol == htons(ETH_P_IP))
+ df = inner_iph->frag_off & htons(IP_DF);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (headroom > dev->needed_headroom)
+ dev->needed_headroom = headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+ key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+tx_error:
+ dev->stats.tx_errors++;
+ goto kfree;
+tx_dropped:
+ dev->stats.tx_dropped++;
+kfree:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
@@ -853,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
- if (new_mtu < 68)
+ if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
if (new_mtu > max_mtu) {
@@ -914,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev)
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
@@ -1116,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev)
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
-void ip_tunnel_setup(struct net_device *dev, int net_id)
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
tunnel->ip_tnl_net_id = net_id;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..0fd1976ab63b 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -63,26 +63,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
struct net_device *dev = skb->dev;
- int skb_iif = skb->skb_iif;
struct iphdr *iph;
int err;
skb_scrub_packet(skb, xnet);
- skb_clear_hash(skb);
+ skb_clear_hash_if_not_l4(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- if (skb_iif && !(df & htons(IP_DF))) {
- /* Arrived from an ingress interface, got encapsulated, with
- * fragmentation of encapulating frames allowed.
- * If skb is gso, the resulting encapsulated network segments
- * may exceed dst mtu.
- * Allow IP Fragmentation of segments.
- */
- IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
- }
-
/* Push down and install the IP header. */
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -324,6 +313,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
.fill_encap = ip_tun_fill_encap_info,
.get_encap_size = ip_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
@@ -414,6 +404,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
.fill_encap = ip6_tun_fill_encap_info,
.get_encap_size = ip6_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
void __init ip_tunnel_core_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d7944f394d9..8b14f1404c8f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -46,7 +46,7 @@
static struct rtnl_link_ops vti_link_ops __read_mostly;
-static int vti_net_id __read_mostly;
+static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 1d71c40eaaf3..fd9f34bbd740 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,7 @@
#include <net/ipconfig.h>
#include <net/route.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <asm/processor.h>
@@ -85,7 +85,6 @@
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
#define CONF_SEND_RETRIES 6 /* Send six requests per open */
-#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */
#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
@@ -188,7 +187,7 @@ struct ic_device {
};
static struct ic_device *ic_first_dev __initdata; /* List of open device */
-static struct net_device *ic_dev __initdata; /* Selected device */
+static struct ic_device *ic_dev __initdata; /* Selected device */
static bool __init ic_is_init_dev(struct net_device *dev)
{
@@ -307,7 +306,7 @@ static void __init ic_close_devs(void)
while ((d = next)) {
next = d->next;
dev = d->dev;
- if (dev != ic_dev && !netdev_uses_dsa(dev)) {
+ if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags);
}
@@ -372,7 +371,7 @@ static int __init ic_setup_if(void)
int err;
memset(&ir, 0, sizeof(ir));
- strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
set_sockaddr(sin, ic_myaddr, 0);
if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
pr_err("IP-Config: Unable to set interface address (%d)\n",
@@ -396,7 +395,7 @@ static int __init ic_setup_if(void)
* out, we'll try to muddle along.
*/
if (ic_dev_mtu != 0) {
- strcpy(ir.ifr_name, ic_dev->name);
+ strcpy(ir.ifr_name, ic_dev->dev->name);
ir.ifr_mtu = ic_dev_mtu;
if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
@@ -568,7 +567,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
goto drop_unlock;
/* We have a winner! */
- ic_dev = dev;
+ ic_dev = d;
if (ic_myaddr == NONE)
ic_myaddr = tip;
ic_servaddr = sip;
@@ -655,8 +654,6 @@ static struct packet_type bootp_packet_type __initdata = {
.func = ic_bootp_recv,
};
-static __be32 ic_dev_xid; /* Device under configuration */
-
/*
* Initialize DHCP/BOOTP extension fields in the request.
*/
@@ -666,14 +663,14 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
#ifdef IPCONFIG_DHCP
static void __init
-ic_dhcp_init_options(u8 *options)
+ic_dhcp_init_options(u8 *options, struct ic_device *d)
{
u8 mt = ((ic_servaddr == NONE)
? DHCPDISCOVER : DHCPREQUEST);
u8 *e = options;
int len;
- pr_debug("DHCP: Sending message type %d\n", mt);
+ pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name);
memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
e += 4;
@@ -857,7 +854,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/* add DHCP options or BOOTP extensions */
#ifdef IPCONFIG_DHCP
if (ic_proto_enabled & IC_USE_DHCP)
- ic_dhcp_init_options(b->exten);
+ ic_dhcp_init_options(b->exten, d);
else
#endif
ic_bootp_init_ext(b->exten);
@@ -1033,14 +1030,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply to our BOOTP request? */
if (b->op != BOOTP_REPLY ||
b->xid != d->xid) {
- net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
- b->op, b->xid);
- goto drop_unlock;
- }
-
- /* Is it a reply for the device we are configuring? */
- if (b->xid != ic_dev_xid) {
- net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n",
+ d->dev->name, b->op, b->xid);
goto drop_unlock;
}
@@ -1075,7 +1066,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
}
}
- pr_debug("DHCP: Got message type %d\n", mt);
+ pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name);
switch (mt) {
case DHCPOFFER:
@@ -1130,7 +1121,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
}
/* We have a winner! */
- ic_dev = dev;
+ ic_dev = d;
ic_myaddr = b->your_ip;
ic_servaddr = b->server_ip;
ic_addrservaddr = b->iph.saddr;
@@ -1225,9 +1216,6 @@ static int __init ic_dynamic(void)
timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
#ifdef IPCONFIG_BOOTP
- /* Track the device we are configuring */
- ic_dev_xid = d->xid;
-
if (do_bootp && (d->able & IC_BOOTP))
ic_bootp_send_if(d, jiffies - start_jiffies);
#endif
@@ -1236,15 +1224,19 @@ static int __init ic_dynamic(void)
ic_rarp_send_if(d);
#endif
- jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
- while (time_before(jiffies, jiff) && !ic_got_reply)
- schedule_timeout_uninterruptible(1);
+ if (!d->next) {
+ jiff = jiffies + timeout;
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
+ }
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
if ((ic_got_reply & IC_BOOTP) &&
(ic_proto_enabled & IC_USE_DHCP) &&
ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
+ /* continue on device that got the reply */
+ d = ic_dev;
pr_cont(",");
continue;
}
@@ -1487,7 +1479,7 @@ static int __init ip_auto_config(void)
#endif /* IPCONFIG_DYNAMIC */
} else {
/* Device selected manually or only one device -> use it */
- ic_dev = ic_first_dev->dev;
+ ic_dev = ic_first_dev;
}
addr = root_nfs_parse_addr(root_server_path);
@@ -1501,14 +1493,6 @@ static int __init ip_auto_config(void)
return -1;
/*
- * Close all network devices except the device we've
- * autoconfigured and set up routes.
- */
- ic_close_devs();
- if (ic_setup_if() < 0 || ic_setup_routes() < 0)
- return -1;
-
- /*
* Record which protocol was actually used.
*/
#ifdef IPCONFIG_DYNAMIC
@@ -1522,7 +1506,7 @@ static int __init ip_auto_config(void)
pr_info("IP-Config: Complete:\n");
pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
- ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+ ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr,
&ic_myaddr, &ic_netmask, &ic_gateway);
pr_info(" host=%s, domain=%s, nis-domain=%s\n",
utsname()->nodename, ic_domain, utsname()->domainname);
@@ -1542,7 +1526,18 @@ static int __init ip_auto_config(void)
pr_cont("\n");
#endif /* !SILENT */
- return 0;
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ err = -1;
+ else
+ err = 0;
+
+ ic_close_devs();
+
+ return err;
}
late_initcall(ip_auto_config);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..00d4229b6954 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -96,7 +96,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -115,12 +115,13 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-static int ipip_net_id __read_mostly;
+static unsigned int ipip_net_id __read_mostly;
static int ipip_tunnel_init(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ if (tunnel->collect_md) {
+ tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+ if (!tun_dst)
+ return 0;
+ }
+ return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
- ip_tunnel_xmit(skb, dev, tiph, ipproto);
+ if (tunnel->collect_md)
+ ip_md_tunnel_xmit(skb, dev, ipproto);
+ else
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm *parms, bool *collect_md)
{
memset(parms, 0, sizeof(*parms));
parms->iph.version = 4;
parms->iph.protocol = IPPROTO_IPIP;
parms->iph.ihl = 5;
+ *collect_md = false;
if (!data)
return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ *collect_md = true;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
if (ipip_netlink_encap_parms(data, &ipencap)) {
- struct ip_tunnel *t = netdev_priv(dev);
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &t->collect_md);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ bool collect_md;
if (ipip_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &collect_md);
+ if (collect_md)
+ return -EINVAL;
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
tunnel->encap.flags))
goto nla_put_failure;
+ if (tunnel->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5f006e13de56..efc1e76d4977 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,7 @@
*
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
@@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp4));
+
err = fib_rules_lookup(net->ipv4.mr_rules_ops,
flowi4_to_flowi(flp4), 0, &arg);
if (err < 0)
@@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -EINVAL;
}
- mrt = ipmr_get_table(rule->fr_net, rule->table);
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = ipmr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -1749,7 +1754,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
vif->dev->stats.tx_bytes += skb->len;
}
- IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
@@ -1809,6 +1814,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
/* Wrong interface: drop packet and (maybe) send PIM assert. */
if (mrt->vif_table[vif].dev != skb->dev) {
+ struct net_device *mdev;
+
+ mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
+ if (mdev == skb->dev)
+ goto forward;
+
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -2053,7 +2064,7 @@ static int pim_rcv(struct sk_buff *skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
(pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3776ff6749f..b3cc1335adbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
if (addr_type == RTN_UNSPEC)
- addr_type = inet_addr_type(net, saddr);
+ addr_type = inet_addr_type_dev_table(net, dev, saddr);
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
flags |= FLOWI_FLAG_ANYSRC;
else
@@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ if (!fl4.flowi4_oif)
+ fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_flags = flags;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c187c60e3e0c..c11eb1744ab1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,16 +25,11 @@ config NF_CONNTRACK_IPV4
To compile it as a module, choose M here. If unsure, say N.
-config NF_CONNTRACK_PROC_COMPAT
- bool "proc/sysctl compatibility with old connection tracking"
- depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
- default y
+config NF_SOCKET_IPV4
+ tristate "IPv4 socket lookup support"
help
- This option enables /proc and sysctl compatibility with the old
- layer 3 dependent connection tracking. This is needed to keep
- old programs that have not been adapted to the new names working.
-
- If unsure, say Y.
+ This option enables the IPv4 socket lookup infrastructure. This is
+ is required by the iptables socket match.
if NF_TABLES
@@ -65,6 +60,14 @@ config NFT_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
+config NFT_FIB_IPV4
+ select NFT_FIB
+ tristate "nf_tables fib / ip route lookup support"
+ help
+ This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
endif # NF_TABLES_IPV4
config NF_TABLES_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 87b073da14c9..f462fee66ac8 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,11 +4,6 @@
# objects for l3 independent conntrack
nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
-ifeq ($(CONFIG_PROC_FS),y)
-nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
-endif
-endif
# connection tracking
obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
@@ -19,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+
# logging
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
@@ -39,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b31df597fd37..a467e1236c43 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -24,7 +24,7 @@
#include <linux/err.h>
#include <net/compat.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
*/
e = get_entry(table_base, private->hook_entry[hook]);
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.hooknum = hook;
- acpar.family = NFPROTO_ARP;
+ acpar.state = state;
acpar.hotdrop = false;
arp = arp_hdr(skb);
@@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
}
static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
- unsigned long pcnt;
int ret;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
static int translate_table(struct xt_table_info *newinfo, void *entry0,
const struct arpt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct arpt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size);
+ ret = find_check_entry(iter, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -809,7 +805,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -838,7 +834,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(NFPROTO_ARP);
@@ -863,7 +859,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
@@ -875,7 +871,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -902,8 +898,8 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1018,8 +1014,8 @@ static int do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1201,8 +1197,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
- newinfo->hook_entry[i] = info->hook_entry[i];
- newinfo->underflow[i] = info->underflow[i];
+ newinfo->hook_entry[i] = compatr->hook_entry[i];
+ newinfo->underflow[i] = compatr->underflow[i];
}
entry1 = newinfo->entries;
pos = entry1;
@@ -1408,7 +1404,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1423,7 +1419,7 @@ static int compat_get_entries(struct net *net,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(NFPROTO_ARP);
return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545a3373..91656a1d8fbd 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -20,7 +20,7 @@
#include <linux/icmp.h>
#include <net/ip.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = 4,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV4;
- acpar.hooknum = hook;
+ acpar.state = state;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
@@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -543,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -683,6 +677,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ipt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -977,7 +973,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1007,7 +1003,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET);
@@ -1032,7 +1028,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1043,7 +1039,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -1068,8 +1064,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1184,8 +1180,8 @@ do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1630,7 +1626,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1644,7 +1640,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(AF_INET);
return ret;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4a9e6db9df8d..0a783cd73faf 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -62,7 +62,7 @@ struct clusterip_config {
static const struct file_operations clusterip_proc_fops;
#endif
-static int clusterip_net_id __read_mostly;
+static unsigned int clusterip_net_id __read_mostly;
struct clusterip_net {
struct list_head configs;
@@ -144,6 +144,11 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
rcu_read_lock_bh();
c = __clusterip_config_find(net, clusterip);
if (c) {
+#ifdef CONFIG_PROC_FS
+ if (!c->pde)
+ c = NULL;
+ else
+#endif
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
@@ -166,14 +171,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
static struct clusterip_config *
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
- struct net_device *dev)
+ struct net_device *dev)
{
+ struct net *net = dev_net(dev);
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
- return NULL;
+ return ERR_PTR(-ENOMEM);
c->dev = dev;
c->clusterip = ip;
@@ -185,6 +191,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
atomic_set(&c->refcount, 1);
atomic_set(&c->entries, 1);
+ spin_lock_bh(&cn->lock);
+ if (__clusterip_config_find(net, ip)) {
+ spin_unlock_bh(&cn->lock);
+ kfree(c);
+
+ return ERR_PTR(-EBUSY);
+ }
+
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
#ifdef CONFIG_PROC_FS
{
char buffer[16];
@@ -195,16 +212,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
+ spin_lock_bh(&cn->lock);
+ list_del_rcu(&c->list);
+ spin_unlock_bh(&cn->lock);
kfree(c);
- return NULL;
+
+ return ERR_PTR(-ENOMEM);
}
}
#endif
- spin_lock_bh(&cn->lock);
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
-
return c;
}
@@ -410,16 +427,16 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
- if (!config) {
+ if (IS_ERR(config)) {
dev_put(dev);
- return -ENOMEM;
+ return PTR_ERR(config);
}
dev_mc_add(config->dev, config->clustermac);
}
}
cipinfo->config = config;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -444,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_get(par->net, par->family);
}
#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
@@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
- return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+ xt_out(par));
+}
+
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target masquerade_tg_reg __read_mostly = {
@@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
};
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1d16c0f28df0..8bd0d7b26632 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,7 +34,7 @@ static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
- int hook = par->hooknum;
+ int hook = xt_hooknum(par);
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(par->net, skb, hook);
+ nf_send_reset(xt_net(par), skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index db5b87509446..30c0de53e254 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -263,12 +263,12 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
- if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
return NF_DROP;
th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg4_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 78cc64eddfc1..37fb9552e858 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
return dev_match || flags & XT_RPFILTER_LOOSE;
}
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
- const struct rtable *rt = skb_rtable(skb);
- return rt && (rt->rt_flags & RTCF_LOCAL);
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -79,14 +79,16 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
info = par->matchinfo;
invert = info->flags & XT_RPFILTER_INVERT;
- if (rpfilter_is_local(skb))
+ if (rpfilter_is_loopback(skb, xt_in(par)))
return true ^ invert;
iph = ip_hdr(skb);
- if (ipv4_is_multicast(iph->daddr)) {
- if (ipv4_is_zeronet(iph->saddr))
- return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr))
+ return true ^ invert;
}
+
flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
@@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index ae1a71a97132..fcfd071f4705 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,6 +31,13 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/nf_log.h>
+static int conntrack4_net_id __read_mostly;
+static DEFINE_MUTEX(register_ipv4_hooks);
+
+struct conntrack4_net {
+ unsigned int users;
+};
+
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -110,7 +117,7 @@ static unsigned int ipv4_helper(void *priv,
if (!help)
return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
@@ -202,47 +209,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
},
};
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
-
-static struct ctl_table ip_ct_sysctl_table[] = {
- {
- .procname = "ip_conntrack_max",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_count",
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_buckets",
- .maxlen = sizeof(unsigned int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_checksum",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_log_invalid",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
-
/* Fast function for those who don't want to parse /proc (and I don't
blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
@@ -348,23 +314,42 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
-static int ipv4_init_net(struct net *net)
+static int ipv4_hooks_register(struct net *net)
{
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- struct nf_ip_net *in = &net->ct.nf_ct_proto;
- in->ctl_table = kmemdup(ip_ct_sysctl_table,
- sizeof(ip_ct_sysctl_table),
- GFP_KERNEL);
- if (!in->ctl_table)
- return -ENOMEM;
-
- in->ctl_table[0].data = &nf_conntrack_max;
- in->ctl_table[1].data = &net->ct.count;
- in->ctl_table[2].data = &nf_conntrack_htable_size;
- in->ctl_table[3].data = &net->ct.sysctl_checksum;
- in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
-#endif
- return 0;
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv4_hooks);
+
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv4_hooks);
+ return err;
+}
+
+static void ipv4_hooks_unregister(struct net *net)
+{
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+
+ mutex_lock(&register_ipv4_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ mutex_unlock(&register_ipv4_hooks);
}
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
@@ -380,10 +365,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
.nla_policy = ipv4_nla_policy,
#endif
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- .ctl_table_path = "net/ipv4/netfilter",
-#endif
- .init_net = ipv4_init_net,
+ .net_ns_get = ipv4_hooks_register,
+ .net_ns_put = ipv4_hooks_unregister,
.me = THIS_MODULE,
};
@@ -394,52 +377,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
+static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
+ &nf_conntrack_l4proto_tcp4,
+ &nf_conntrack_l4proto_udp4,
+ &nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite4,
+#endif
+};
+
static int ipv4_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_tcp4: pernet registration failed\n");
- goto out_tcp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_udp4: pernet registration failed\n");
- goto out_udp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_icmp4: pernet registration failed\n");
- goto out_icmp;
- }
+ ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
+ return ret;
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: pernet registration failed\n");
- goto out_ipv4;
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
- return 0;
-out_ipv4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
-out_icmp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
-out_udp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
-out_tcp:
return ret;
}
static void ipv4_net_exit(struct net *net)
{
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
static struct pernet_operations ipv4_net_ops = {
.init = ipv4_net_init,
.exit = ipv4_net_exit,
+ .id = &conntrack4_net_id,
+ .size = sizeof(struct conntrack4_net),
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
@@ -447,7 +428,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv4_enable();
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
@@ -461,55 +441,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
goto cleanup_sockopt;
}
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ ret = nf_ct_l4proto_register(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
goto cleanup_pernet;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
- goto cleanup_hooks;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
- goto cleanup_tcp4;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
- goto cleanup_udp4;
- }
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_icmpv4;
+ goto cleanup_l4proto;
}
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- ret = nf_conntrack_ipv4_compat_init();
- if (ret < 0)
- goto cleanup_proto;
-#endif
return ret;
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- cleanup_proto:
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
-#endif
- cleanup_icmpv4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- cleanup_hooks:
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+cleanup_l4proto:
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -520,14 +466,9 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
static void __exit nf_conntrack_l3proto_ipv4_fini(void)
{
synchronize_net();
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- nf_conntrack_ipv4_compat_fini();
-#endif
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
deleted file mode 100644
index 63923710f325..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/percpu.h>
-#include <linux/security.h>
-#include <net/net_namespace.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_conntrack_acct.h>
-#include <linux/rculist_nulls.h>
-#include <linux/export.h>
-
-struct ct_iter_state {
- struct seq_net_private p;
- struct hlist_nulls_head *hash;
- unsigned int htable_size;
- unsigned int bucket;
-};
-
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
-{
- struct ct_iter_state *st = seq->private;
- struct hlist_nulls_node *n;
-
- for (st->bucket = 0;
- st->bucket < st->htable_size;
- st->bucket++) {
- n = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- if (!is_a_nulls(n))
- return n;
- }
- return NULL;
-}
-
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
- struct hlist_nulls_node *head)
-{
- struct ct_iter_state *st = seq->private;
-
- head = rcu_dereference(hlist_nulls_next_rcu(head));
- while (is_a_nulls(head)) {
- if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= st->htable_size)
- return NULL;
- }
- head = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- }
- return head;
-}
-
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_nulls_node *head = ct_get_first(seq);
-
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- struct ct_iter_state *st = seq->private;
-
- rcu_read_lock();
-
- nf_conntrack_get_ht(&st->hash, &st->htable_size);
- return ct_get_idx(seq, *pos);
-}
-
-static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
- return ct_get_next(s, v);
-}
-
-static void ct_seq_stop(struct seq_file *s, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
- int ret;
- u32 len;
- char *secctx;
-
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
- return;
-
- seq_printf(s, "secctx=%s ", secctx);
-
- security_release_secctx(secctx, len);
-}
-#else
-static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
-}
-#endif
-
-static bool ct_seq_should_skip(const struct nf_conn *ct,
- const struct net *net,
- const struct nf_conntrack_tuple_hash *hash)
-{
- /* we only want to print DIR_ORIGINAL */
- if (NF_CT_DIRECTION(hash))
- return true;
-
- if (nf_ct_l3num(ct) != AF_INET)
- return true;
-
- if (!net_eq(nf_ct_net(ct), net))
- return true;
-
- return false;
-}
-
-static int ct_seq_show(struct seq_file *s, void *v)
-{
- struct nf_conntrack_tuple_hash *hash = v;
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
- const struct nf_conntrack_l3proto *l3proto;
- const struct nf_conntrack_l4proto *l4proto;
- int ret = 0;
-
- NF_CT_ASSERT(ct);
- if (ct_seq_should_skip(ct, seq_file_net(s), hash))
- return 0;
-
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
- return 0;
-
- /* check if we raced w. object reuse */
- if (!nf_ct_is_confirmed(ct) ||
- ct_seq_should_skip(ct, seq_file_net(s), hash))
- goto release;
-
- l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
- NF_CT_ASSERT(l3proto);
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
- NF_CT_ASSERT(l4proto);
-
- ret = -ENOSPC;
- seq_printf(s, "%-8s %u %ld ",
- l4proto->name, nf_ct_protonum(ct),
- timer_pending(&ct->timeout)
- ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
-
- if (l4proto->print_conntrack)
- l4proto->print_conntrack(s, ct);
-
- if (seq_has_overflowed(s))
- goto release;
-
- print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, l4proto);
-
- if (seq_has_overflowed(s))
- goto release;
-
- if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
- goto release;
-
- if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
- seq_printf(s, "[UNREPLIED] ");
-
- print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, l4proto);
-
- if (seq_has_overflowed(s))
- goto release;
-
- if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- goto release;
-
- if (test_bit(IPS_ASSURED_BIT, &ct->status))
- seq_printf(s, "[ASSURED] ");
-
-#ifdef CONFIG_NF_CONNTRACK_MARK
- seq_printf(s, "mark=%u ", ct->mark);
-#endif
-
- ct_show_secctx(s, ct);
-
- seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
-
- if (seq_has_overflowed(s))
- goto release;
-
- ret = 0;
-release:
- nf_ct_put(ct);
- return ret;
-}
-
-static const struct seq_operations ct_seq_ops = {
- .start = ct_seq_start,
- .next = ct_seq_next,
- .stop = ct_seq_stop,
- .show = ct_seq_show
-};
-
-static int ct_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &ct_seq_ops,
- sizeof(struct ct_iter_state));
-}
-
-static const struct file_operations ct_file_ops = {
- .owner = THIS_MODULE,
- .open = ct_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-/* expects */
-struct ct_expect_iter_state {
- struct seq_net_private p;
- unsigned int bucket;
-};
-
-static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
-{
- struct ct_expect_iter_state *st = seq->private;
- struct hlist_node *n;
-
- for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(
- hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
- if (n)
- return n;
- }
- return NULL;
-}
-
-static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
- struct hlist_node *head)
-{
- struct ct_expect_iter_state *st = seq->private;
-
- head = rcu_dereference(hlist_next_rcu(head));
- while (head == NULL) {
- if (++st->bucket >= nf_ct_expect_hsize)
- return NULL;
- head = rcu_dereference(
- hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
- }
- return head;
-}
-
-static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_node *head = ct_expect_get_first(seq);
-
- if (head)
- while (pos && (head = ct_expect_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- rcu_read_lock();
- return ct_expect_get_idx(seq, *pos);
-}
-
-static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- (*pos)++;
- return ct_expect_get_next(seq, v);
-}
-
-static void exp_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static int exp_seq_show(struct seq_file *s, void *v)
-{
- struct nf_conntrack_expect *exp;
- const struct hlist_node *n = v;
-
- exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
-
- if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
- return 0;
-
- if (exp->tuple.src.l3num != AF_INET)
- return 0;
-
- if (exp->timeout.function)
- seq_printf(s, "%ld ", timer_pending(&exp->timeout)
- ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
- else
- seq_printf(s, "- ");
-
- seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
-
- print_tuple(s, &exp->tuple,
- __nf_ct_l3proto_find(exp->tuple.src.l3num),
- __nf_ct_l4proto_find(exp->tuple.src.l3num,
- exp->tuple.dst.protonum));
- seq_putc(s, '\n');
-
- return 0;
-}
-
-static const struct seq_operations exp_seq_ops = {
- .start = exp_seq_start,
- .next = exp_seq_next,
- .stop = exp_seq_stop,
- .show = exp_seq_show
-};
-
-static int exp_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &exp_seq_ops,
- sizeof(struct ct_expect_iter_state));
-}
-
-static const struct file_operations ip_exp_file_ops = {
- .owner = THIS_MODULE,
- .open = exp_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct net *net = seq_file_net(seq);
- int cpu;
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return per_cpu_ptr(net->ct.stat, cpu);
- }
-
- return NULL;
-}
-
-static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net *net = seq_file_net(seq);
- int cpu;
-
- for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return per_cpu_ptr(net->ct.stat, cpu);
- }
-
- return NULL;
-}
-
-static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static int ct_cpu_seq_show(struct seq_file *seq, void *v)
-{
- struct net *net = seq_file_net(seq);
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
- const struct ip_conntrack_stat *st = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
- return 0;
- }
-
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
- "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- nr_conntracks,
- st->searched,
- st->found,
- st->new,
- st->invalid,
- st->ignore,
- st->delete,
- st->delete_list,
- st->insert,
- st->insert_failed,
- st->drop,
- st->early_drop,
- st->error,
-
- st->expect_new,
- st->expect_create,
- st->expect_delete,
- st->search_restart
- );
- return 0;
-}
-
-static const struct seq_operations ct_cpu_seq_ops = {
- .start = ct_cpu_seq_start,
- .next = ct_cpu_seq_next,
- .stop = ct_cpu_seq_stop,
- .show = ct_cpu_seq_show,
-};
-
-static int ct_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &ct_cpu_seq_ops,
- sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ct_cpu_seq_fops = {
- .owner = THIS_MODULE,
- .open = ct_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-static int __net_init ip_conntrack_net_init(struct net *net)
-{
- struct proc_dir_entry *proc, *proc_exp, *proc_stat;
-
- proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
- if (!proc)
- goto err1;
-
- proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
- &ip_exp_file_ops);
- if (!proc_exp)
- goto err2;
-
- proc_stat = proc_create("ip_conntrack", S_IRUGO,
- net->proc_net_stat, &ct_cpu_seq_fops);
- if (!proc_stat)
- goto err3;
- return 0;
-
-err3:
- remove_proc_entry("ip_conntrack_expect", net->proc_net);
-err2:
- remove_proc_entry("ip_conntrack", net->proc_net);
-err1:
- return -ENOMEM;
-}
-
-static void __net_exit ip_conntrack_net_exit(struct net *net)
-{
- remove_proc_entry("ip_conntrack", net->proc_net_stat);
- remove_proc_entry("ip_conntrack_expect", net->proc_net);
- remove_proc_entry("ip_conntrack", net->proc_net);
-}
-
-static struct pernet_operations ip_conntrack_net_ops = {
- .init = ip_conntrack_net_init,
- .exit = ip_conntrack_net_exit,
-};
-
-int __init nf_conntrack_ipv4_compat_init(void)
-{
- return register_pernet_subsys(&ip_conntrack_net_ops);
-}
-
-void __exit nf_conntrack_ipv4_compat_fini(void)
-{
- unregister_pernet_subsys(&ip_conntrack_net_ops);
-}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index c567e1b5d799..d075b3cf2400 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return -NF_ACCEPT;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
@@ -327,17 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = {
},
{ }
};
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table icmp_compat_sysctl_table[] = {
- {
- .procname = "ip_conntrack_icmp_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -355,40 +344,14 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
- struct nf_icmp_net *in)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
- sizeof(icmp_compat_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_compat_table)
- return -ENOMEM;
-
- pn->ctl_compat_table[0].data = &in->timeout;
-#endif
-#endif
- return 0;
-}
-
static int icmp_init_net(struct net *net, u_int16_t proto)
{
- int ret;
struct nf_icmp_net *in = icmp_pernet(net);
struct nf_proto_net *pn = &in->pn;
in->timeout = nf_ct_icmp_timeout;
- ret = icmp_kmemdup_compat_sysctl_table(pn, in);
- if (ret < 0)
- return ret;
-
- ret = icmp_kmemdup_sysctl_table(pn, in);
- if (ret < 0)
- nf_ct_kfree_compat_sysctl_table(pn);
-
- return ret;
+ return icmp_kmemdup_sysctl_table(pn, in);
}
static struct nf_proto_net *icmp_get_net_proto(struct net *net)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index d88da36b383c..49bd6a54404f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>
@@ -22,6 +23,8 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
+static DEFINE_MUTEX(defrag4_mutex);
+
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
u_int32_t user)
{
@@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
},
};
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv4) {
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ net->nf.defrag_ipv4 = false;
+ }
+}
+
+static struct pernet_operations defrag4_net_ops = {
+ .exit = defrag4_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
- return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ return register_pernet_subsys(&defrag4_net_ops);
}
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ unregister_pernet_subsys(&defrag4_net_ops);
}
-void nf_defrag_ipv4_enable(void)
+int nf_defrag_ipv4_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv4)
+ return 0;
+
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv4 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag4_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ceb187308120..cf986e1c7bbd 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -74,21 +74,19 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
nf_conntrack_get(skb->nfct);
#endif
/*
- * If we are in PREROUTING/INPUT, the checksum must be recalculated
- * since the length could have changed as a result of defragmentation.
- *
- * We also decrease the TTL to mitigate potential loops between two
- * hosts.
+ * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
+ * loops between two hosts.
*
* Set %IP_DF so that the original source is notified of a potentially
* decreased MTU on the clone route. IPv6 does this too.
+ *
+ * IP header checksum will be recalculated at ip_local_out.
*/
iph = ip_hdr(skb);
iph->frag_off |= htons(IP_DF);
if (hooknum == NF_INET_PRE_ROUTING ||
hooknum == NF_INET_LOCAL_IN)
--iph->ttl;
- ip_send_check(iph);
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index e7ad950cf9ef..b24795e2ee6d 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -62,7 +62,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
/* If it's for Ethernet and the lengths are OK, then log the ARP
* payload.
*/
- if (ah->ar_hrd != htons(1) ||
+ if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_hln != ETH_ALEN ||
ah->ar_pln != sizeof(__be32))
return;
@@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = {
static int __net_init nf_log_arp_net_init(struct net *net)
{
- nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
- return 0;
+ return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
}
static void __net_exit nf_log_arp_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 076aadda0473..856648966f4c 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
else
- logflags = NF_LOG_MASK;
+ logflags = NF_LOG_DEFAULT_MASK;
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
@@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
if (ntohs(ih->frag_off) & IP_OFFSET)
nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((logflags & XT_LOG_IPOPT) &&
+ if ((logflags & NF_LOG_IPOPT) &&
ih->ihl * 4 > sizeof(struct iphdr)) {
const unsigned char *op;
unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
@@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
}
/* Max length: 15 "UID=4294967295 " */
- if ((logflags & XT_LOG_UID) && !iphoff)
+ if ((logflags & NF_LOG_UID) && !iphoff)
nf_log_dump_sk_uid_gid(m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
- if (!(logflags & XT_LOG_MACDECODE))
+ if (!(logflags & NF_LOG_MACDECODE))
goto fallback;
switch (dev->type) {
@@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = {
static int __net_init nf_log_ipv4_net_init(struct net *net)
{
- nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
- return 0;
+ return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
}
static void __net_exit nf_log_ipv4_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 9414923f1e15..edf05002d674 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
- const struct gre_hdr *greh;
- struct gre_hdr_pptp *pgreh;
+ const struct gre_base_hdr *greh;
+ struct pptp_gre_header *pgreh;
/* pgreh includes two optional 32bit fields which are not required
* to be there. That's where the magic '8' comes from */
@@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb,
return false;
greh = (void *)skb->data + hdroff;
- pgreh = (struct gre_hdr_pptp *)greh;
+ pgreh = (struct pptp_gre_header *)greh;
/* we only have destination manip of a packet, since 'source key'
* is not present in the packet itself */
if (maniptype != NF_NAT_MANIP_DST)
return true;
- switch (greh->version) {
- case GRE_VERSION_1701:
+
+ switch (greh->flags & GRE_VERSION) {
+ case GRE_VERSION_0:
/* We do not currently NAT any GREv0 packets.
* Try to behave like "nf_nat_proto_unknown" */
break;
- case GRE_VERSION_PPTP:
+ case GRE_VERSION_1:
pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
pgreh->call_id = tuple->dst.u.gre.key;
break;
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fd8220213afc..146d86105183 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 000000000000..a83d558e1aae
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+ __be32 *raddr, __be32 *laddr,
+ __be16 *rport, __be16 *lport)
+{
+ unsigned int outside_hdrlen = ip_hdrlen(skb);
+ struct iphdr *inside_iph, _inside_iph;
+ struct icmphdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_REDIRECT:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return 1;
+ }
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr),
+ sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+
+ if (inside_iph->protocol != IPPROTO_TCP &&
+ inside_iph->protocol != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr) +
+ (inside_iph->ihl << 2),
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_iph->protocol;
+ *laddr = inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be32 uninitialized_var(daddr), uninitialized_var(saddr);
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ u8 uninitialized_var(protocol);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
+ int doff = 0;
+
+ if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ protocol = iph->protocol;
+ saddr = iph->saddr;
+ sport = hp->source;
+ daddr = iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = iph->protocol == IPPROTO_TCP ?
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+ ip_hdrlen(skb) + sizeof(*hp);
+
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+ &sport, &dport))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) &&
+ ((iph->protocol != IPPROTO_ICMP &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (iph->protocol == IPPROTO_ICMP &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+
+ daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+ dport = (iph->protocol == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+ daddr, sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index cd84d4295a20..805c8ddfe860 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv,
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
return nft_do_chain(&pkt, priv);
}
@@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void)
{
int ret;
- nft_register_chain_type(&filter_arp);
+ ret = nft_register_chain_type(&filter_arp);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_arp_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_arp);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index e44ba3b12fbb..2840a29b2e04 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void)
{
int ret;
- nft_register_chain_type(&filter_ipv4);
+ ret = nft_register_chain_type(&filter_ipv4);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_ipv4);
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index bf855e64fc45..0af3d8df70dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -28,9 +28,9 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr,
struct in_addr gw = {
.s_addr = (__force __be32)regs->data[priv->sreg_addr],
};
- int oif = regs->data[priv->sreg_dev];
+ int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
- nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
+ nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
}
static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
@@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
- if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+ goto nla_put_failure;
+ if (priv->sreg_dev &&
nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
goto nla_put_failure;
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 000000000000..2981291910dd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,236 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+#define DSCP_BITS 0xfc
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dst = &regs->data[priv->dreg];
+ const struct net_device *dev = NULL;
+ const struct iphdr *iph;
+ __be32 addr;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ iph = ip_hdr(pkt->skb);
+ if (priv->flags & NFTA_FIB_F_DADDR)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct iphdr *iph;
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+ const struct net_device *oif;
+ struct net_device *found;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int i;
+#endif
+
+ /*
+ * Do not set flowi4_oif, it restricts results (for example, asking
+ * for oif 3 will get RTN_UNICAST result even if the daddr exits
+ * on another interface.
+ *
+ * Search results for the desired outinterface instead.
+ */
+ if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+ else if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else
+ oif = NULL;
+
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+ nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ nft_in(pkt)->ifindex);
+ return;
+ }
+
+ iph = ip_hdr(pkt->skb);
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr)) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ get_ifindex(pkt->skb->dev));
+ return;
+ }
+ }
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl4.flowi4_mark = pkt->skb->mark;
+
+ fl4.flowi4_tos = iph->tos & DSCP_BITS;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl4.daddr = iph->daddr;
+ fl4.saddr = get_saddr(iph->saddr);
+ } else {
+ fl4.daddr = iph->saddr;
+ fl4.saddr = get_saddr(iph->daddr);
+ }
+
+ *dest = 0;
+
+ if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+ return;
+
+ switch (res.type) {
+ case RTN_UNICAST:
+ break;
+ case RTN_LOCAL: /* Should not see RTN_LOCAL here */
+ return;
+ default:
+ break;
+ }
+
+ if (!oif) {
+ found = FIB_RES_DEV(res);
+ goto ok;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (i = 0; i < res.fi->fib_nhs; i++) {
+ struct fib_nh *nh = &res.fi->fib_nh[i];
+
+ if (nh->nh_dev == oif) {
+ found = nh->nh_dev;
+ goto ok;
+ }
+ }
+ return;
+#else
+ found = FIB_RES_DEV(res);
+ if (found != oif)
+ return;
+#endif
+ok:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ *dest = found->ifindex;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ strncpy((char *)dest, found->name, IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval);
+
+static struct nft_expr_type nft_fib4_type;
+
+static const struct nft_expr_ops nft_fib4_type_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops nft_fib4_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops *
+nft_fib4_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib4_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib4_type __read_mostly = {
+ .name = "fib",
+ .select_ops = &nft_fib4_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib4_module_init(void)
+{
+ return nft_register_expr(&nft_fib4_type);
+}
+
+static void __exit nft_fib4_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib4_type);
+}
+
+module_init(nft_fib4_module_init);
+module_exit(nft_fib4_module_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 51ced81b616c..a0ea8aad1bf1 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -31,8 +31,14 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
range.max_proto.all =
*(__be16 *)&regs->data[priv->sreg_proto_max];
}
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
- &range, pkt->out);
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
+ &range, nft_out(pkt));
+}
+
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_masq_ipv4_type;
@@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv4_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init);
module_exit(nft_masq_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index c09d4381427e..1650ed23c15d 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -35,8 +35,13 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
mr.range[0].flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->hook);
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+}
+
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_redir_ipv4_type;
@@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv4_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init);
module_exit(nft_redir_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 2c2553b9026c..517ce93699de 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
+ nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 66ddcb60519a..68d77b1f1495 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk)
struct net *net = sock_net(sk);
kgid_t group = current_egid();
struct group_info *group_info;
- int i, j, count;
+ int i;
kgid_t low, high;
int ret = 0;
@@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk)
return 0;
group_info = get_current_groups();
- count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
- for (j = 0; j < cp_count; j++) {
- kgid_t gid = group_info->blocks[i][j];
- if (gid_lte(low, gid) && gid_lte(gid, high))
- goto out_release_group;
- }
+ for (i = 0; i < group_info->ngroups; i++) {
+ kgid_t gid = group_info->gid[i];
- count -= cp_count;
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group;
}
ret = -EACCES;
@@ -614,15 +609,15 @@ int ping_getfrag(void *from, char *to,
fraglen -= sizeof(struct icmphdr);
if (fraglen < 0)
BUG();
- if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+ if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr),
fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ &pfh->msg->msg_iter))
return -EFAULT;
} else if (offset < sizeof(struct icmphdr)) {
BUG();
} else {
- if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter))
return -EFAULT;
}
@@ -647,6 +642,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
{
struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ return 0;
pfh->wcheck = csum_partial((char *)&pfh->icmph,
sizeof(struct icmphdr), pfh->wcheck);
pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -662,6 +659,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
if (len > 0xFFFF)
return -EMSGSIZE;
+ /* Must have at least a full ICMP header. */
+ if (len < icmph_len)
+ return -EINVAL;
+
/*
* Check the flags.
*/
@@ -794,7 +795,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
@@ -999,7 +1001,7 @@ struct proto ping_prot = {
.init = ping_init_sock,
.close = ping_close,
.connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
+ .disconnect = __udp_disconnect,
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.sendmsg = ping_v4_sendmsg,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 9f665b63a927..7143ca1a6af9 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -46,6 +46,8 @@
#include <net/sock.h>
#include <net/raw.h>
+#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
+
/*
* Report socket allocation statistics [mea@utu.fi]
*/
@@ -257,6 +259,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+ SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
@@ -355,22 +358,22 @@ static void icmp_put(struct seq_file *seq)
atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
seq_puts(seq, " OutMsgs OutErrors");
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
@@ -378,14 +381,16 @@ static void icmp_put(struct seq_file *seq)
/*
* Called from the PROCfs module. This outputs /proc/net/snmp.
*/
-static int snmp_seq_show(struct seq_file *seq, void *v)
+static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
- int i;
struct net *net = seq->private;
+ u64 buff64[IPSTATS_MIB_MAX];
+ int i;
- seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ for (i = 0; snmp4_ipstats_list[i].name; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
@@ -393,57 +398,77 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
net->ipv4.sysctl_ip_default_ttl);
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
- seq_printf(seq, " %llu",
- snmp_fold_field64(net->mib.ip_statistics,
- snmp4_ipstats_list[i].entry,
- offsetof(struct ipstats_mib, syncp)));
+ snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; snmp4_ipstats_list[i].name; i++)
+ seq_printf(seq, " %llu", buff64[i]);
- icmp_put(seq); /* RFC 2011 compatibility */
- icmpmsg_put(seq);
+ return 0;
+}
+
+static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
+{
+ unsigned long buff[TCPUDP_MIB_MAX];
+ struct net *net = seq->private;
+ int i;
+
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ for (i = 0; snmp4_tcp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
+ net->mib.tcp_statistics);
+ for (i = 0; snmp4_tcp_list[i].name; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
- seq_printf(seq, " %ld",
- snmp_fold_field(net->mib.tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %ld", buff[i]);
else
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %lu", buff[i]);
}
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+
+ snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ for (i = 0; snmp4_udp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.udp_statistics,
- snmp4_udp_list[i].entry));
+ for (i = 0; snmp4_udp_list[i].name; i++)
+ seq_printf(seq, " %lu", buff[i]);
+
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+ net->mib.udplite_statistics);
+ for (i = 0; snmp4_udp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.udplite_statistics,
- snmp4_udp_list[i].entry));
+ for (i = 0; snmp4_udp_list[i].name; i++)
+ seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
return 0;
}
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+ snmp_seq_show_ipstats(seq, v);
+
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
+
+ snmp_seq_show_tcp_udp(seq, v);
+
+ return 0;
+}
+
static int snmp_seq_open(struct inode *inode, struct file *file)
{
return single_open_net(inode, file, snmp_seq_show);
@@ -468,21 +493,21 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
struct net *net = seq->private;
seq_puts(seq, "TcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ for (i = 0; snmp4_net_list[i].name; i++)
seq_printf(seq, " %s", snmp4_net_list[i].name);
seq_puts(seq, "\nTcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ for (i = 0; snmp4_net_list[i].name; i++)
seq_printf(seq, " %lu",
snmp_fold_field(net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ for (i = 0; snmp4_ipextstats_list[i].name; i++)
seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ for (i = 0; snmp4_ipextstats_list[i].name; i++)
seq_printf(seq, " %llu",
snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 438f50c1a676..4e49e5cb001c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -41,7 +41,7 @@
#include <linux/atomic.h>
#include <asm/byteorder.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/stddef.h>
#include <linux/slab.h>
@@ -89,9 +89,10 @@ struct raw_frag_vec {
int hlen;
};
-static struct raw_hashinfo raw_v4_hashinfo = {
+struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
};
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
@@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
-static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
sk_for_each_from(sk) {
@@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
found:
return sk;
}
+EXPORT_SYMBOL_GPL(__raw_v4_lookup);
/*
* 0 - deliver
@@ -604,13 +606,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
-
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, &fl4);
- if (err < 0)
- goto done;
- }
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!inet->hdrincl) {
rfv.msg = msg;
@@ -699,12 +695,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+ if (sk->sk_bound_dev_if)
+ tb_id = l3mdev_fib_table_by_index(sock_net(sk),
+ sk->sk_bound_dev_if) ? : tb_id;
+
+ chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
+ tb_id);
+
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
@@ -918,13 +922,27 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg
}
#endif
+int raw_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_abort);
+
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
.close = raw_close,
.destroy = raw_destroy,
.connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
+ .disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
.init = raw_init,
.setsockopt = raw_setsockopt,
@@ -943,6 +961,7 @@ struct proto raw_prot = {
.compat_getsockopt = compat_raw_getsockopt,
.compat_ioctl = compat_raw_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 000000000000..e1a51ca68d23
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,266 @@
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+ if (r->sdiag_family == AF_INET) {
+ return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (r->sdiag_family == AF_INET6) {
+ return &raw_v6_hashinfo;
+#endif
+ } else {
+ pr_warn_once("Unexpected inet family %d\n",
+ r->sdiag_family);
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+ }
+}
+
+/*
+ * Due to requirement of not breaking user API we can't simply
+ * rename @pad field in inet_diag_req_v2 structure, instead
+ * use helper to figure it out.
+ */
+
+static struct sock *raw_lookup(struct net *net, struct sock *from,
+ const struct inet_diag_req_v2 *req)
+{
+ struct inet_diag_req_raw *r = (void *)req;
+ struct sock *sk = NULL;
+
+ if (r->sdiag_family == AF_INET)
+ sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if);
+#endif
+ return sk;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct sock *sk = NULL, *s;
+ int slot;
+
+ if (IS_ERR(hashinfo))
+ return ERR_CAST(hashinfo);
+
+ read_lock(&hashinfo->lock);
+ for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+ sk_for_each(s, &hashinfo->ht[slot]) {
+ sk = raw_lookup(net, s, r);
+ if (sk) {
+ /*
+ * Grab it and keep until we fill
+ * diag meaage to be reported, so
+ * caller should call sock_put then.
+ * We can do that because we're keeping
+ * hashinfo->lock here.
+ */
+ sock_hold(sk);
+ goto out_unlock;
+ }
+ }
+ }
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ return sk ? sk : ERR_PTR(-ENOENT);
+}
+
+static int raw_diag_dump_one(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep) {
+ sock_put(sk);
+ return -ENOMEM;
+ }
+
+ err = inet_sk_diag_fill(sk, NULL, rep, r,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ sock_put(sk);
+
+ if (err < 0) {
+ kfree_skb(rep);
+ return err;
+ }
+
+ err = netlink_unicast(net->diag_nlsk, rep,
+ NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh, net_admin);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+ struct sock *sk = NULL;
+
+ if (IS_ERR(hashinfo))
+ return;
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ read_lock(&hashinfo->lock);
+ for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+ num = 0;
+
+ sk_for_each(sk, &hashinfo->ht[slot]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ goto out_unlock;
+next:
+ num++;
+ }
+ }
+
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int raw_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+ sock_put(sk);
+ return err;
+}
+#endif
+
+static const struct inet_diag_handler raw_diag_handler = {
+ .dump = raw_diag_dump,
+ .dump_one = raw_diag_dump_one,
+ .idiag_get_info = raw_diag_get_info,
+ .idiag_type = IPPROTO_RAW,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = raw_diag_destroy,
+#endif
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+ /*
+ * Make sure the two structures are identical,
+ * except the @pad field.
+ */
+#define __offset_mismatch(m1, m2) \
+ (offsetof(struct inet_diag_req_v2, m1) != \
+ offsetof(struct inet_diag_req_raw, m2))
+
+ BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+ sizeof(struct inet_diag_req_raw));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+ BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+ BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{
+ return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+ inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 62c3ed0b7556..709ffe67d1de 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -65,7 +65,7 @@
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -507,7 +507,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -523,19 +524,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -552,7 +555,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -753,7 +756,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
goto reject_redirect;
}
- n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
+ if (!n)
+ n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
if (!IS_ERR(n)) {
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
@@ -793,6 +798,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -800,7 +806,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1018,7 +1024,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1034,7 +1040,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1053,6 +1059,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1066,7 +1073,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1106,7 +1113,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1121,9 +1128,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -1252,7 +1260,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = 576;
}
- return min_t(unsigned int, mtu, IP_MAX_MTU);
+ mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@ -1594,6 +1604,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
+static void set_lwt_redirect(struct rtable *rth)
+{
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1683,14 +1706,7 @@ rt_cache:
rth->dst.input = ip_forward;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
- if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_input = rth->dst.input;
- rth->dst.input = lwtunnel_input;
- }
+ set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1835,7 +1851,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
+ fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -1898,7 +1914,8 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
+ rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+ flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
@@ -1917,8 +1934,18 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
+
if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nh, rth))) {
rth->dst.flags |= DST_NOCACHE;
rt_add_uncached_list(rth);
}
@@ -1978,25 +2005,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ int our = 0;
- if (in_dev) {
- int our = ip_check_mc_rcu(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
- if (our
+ if (in_dev)
+ our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+
+ /* check l3 master if no match yet */
+ if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
+ struct in_device *l3_in_dev;
+
+ l3_in_dev = __in_dev_get_rcu(skb->dev);
+ if (l3_in_dev)
+ our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ }
+
+ res = -EINVAL;
+ if (our
#ifdef CONFIG_IP_MROUTE
- ||
- (!ipv4_is_local_multicast(daddr) &&
- IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
- int res = ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
- rcu_read_unlock();
- return res;
- }
+ ) {
+ res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
}
rcu_read_unlock();
- return -EINVAL;
+ return res;
}
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
@@ -2022,7 +2059,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
return ERR_PTR(-EINVAL);
if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl4->saddr) &&
+ !(dev_out->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl4->daddr))
@@ -2134,8 +2173,7 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate))
- rth->dst.output = lwtunnel_output;
+ set_lwt_redirect(rth);
return rth;
}
@@ -2152,7 +2190,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- int master_idx;
int orig_oif;
int err = -ENETUNREACH;
@@ -2162,9 +2199,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
orig_oif = fl4->flowi4_oif;
- master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
- if (master_idx)
- fl4->flowi4_oif = master_idx;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2248,10 +2282,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
-
- rth = l3mdev_get_rtable(dev_out, fl4);
- if (rth)
- goto out;
}
if (!fl4->daddr) {
@@ -2270,7 +2300,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
res.fi = NULL;
res.table = NULL;
if (fl4->flowi4_oif &&
- !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+ (ipv4_is_multicast(fl4->daddr) ||
+ !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2306,7 +2337,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
else
fl4->saddr = fl4->daddr;
}
- dev_out = net->loopback_dev;
+
+ /* L3 master device is the loopback for that domain */
+ dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2439,7 +2472,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = table_id;
+ r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
@@ -2495,6 +2528,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2547,6 +2585,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int mark;
struct sk_buff *skb;
u32 table_id = RT_TABLE_MAIN;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2574,6 +2613,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2581,9 +2624,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
-
- if (netif_index_is_l3_master(net, fl4.flowi4_oif))
- fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e3c4043c27de..3e88467d70ee 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
@@ -372,7 +373,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1cb67de106fe..b2fa498b15d1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -96,11 +96,11 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
container_of(table->data, struct net, ipv4.ping_group_range.range);
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
}
/* Update system visible IP port range */
@@ -109,10 +109,10 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
kgid_t *data = table->data;
struct net *net =
container_of(table->data, struct net, ipv4.ping_group_range.range);
- write_seqlock(&net->ipv4.ip_local_ports.lock);
+ write_seqlock(&net->ipv4.ping_group_range.lock);
data[0] = low;
data[1] = high;
- write_sequnlock(&net->ipv4.ip_local_ports.lock);
+ write_sequnlock(&net->ipv4.ping_group_range.lock);
}
/* Validate changes from /proc interface. */
@@ -433,13 +433,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &tcp_adv_win_scale_max,
},
{
- .procname = "tcp_tw_reuse",
- .data = &sysctl_tcp_tw_reuse,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_frto",
.data = &sysctl_tcp_frto,
.maxlen = sizeof(int),
@@ -958,7 +951,14 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "tcp_tw_reuse",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffbb218de520..0efb4c7f6704 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,9 +277,8 @@
#include <net/ip.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
-#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_min_tso_segs __read_mostly = 2;
@@ -380,14 +379,14 @@ void tcp_init_sock(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- __skb_queue_head_init(&tp->out_of_order_queue);
+ tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- tp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -396,13 +395,15 @@ void tcp_init_sock(struct sock *sk)
*/
tp->snd_cwnd = TCP_INIT_CWND;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- u64_stats_init(&tp->syncp);
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
@@ -421,8 +422,6 @@ void tcp_init_sock(struct sock *sk)
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable();
- if (mem_cgroup_sockets_enabled)
- sock_update_memcg(sk);
sk_sockets_allocated_inc(sk);
local_bh_enable();
}
@@ -664,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
@@ -688,8 +687,7 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
int ret;
ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
- min(rd_desc->count, len), tss->flags,
- skb_socket_splice);
+ min(rd_desc->count, len), tss->flags);
if (ret > 0)
rd_desc->count -= ret;
return ret;
@@ -772,6 +770,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
+ /* if __tcp_splice_read() got nothing while we have
+ * an skb in receive queue, we do not want to loop.
+ * This might happen with URG data.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
@@ -998,8 +1002,11 @@ do_error:
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
return sk_stream_error(sk, flags, err);
}
@@ -1014,23 +1021,40 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
+
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
}
EXPORT_SYMBOL(tcp_sendpage);
-static inline int select_size(const struct sock *sk, bool sg)
+/* Do not bother using a page frag for very small frames.
+ * But use this heuristic only for the first skb in write queue.
+ *
+ * Having no payload in skb->head allows better SACK shifting
+ * in tcp_shift_skb_data(), reducing sack/rack overhead, because
+ * write queue has less skbs.
+ * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
+ * This also speeds up tso_fragment(), since it wont fallback
+ * to tcp_fragment().
+ */
+static int linear_payload_sz(bool first_skb)
+{
+ if (first_skb)
+ return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ return 0;
+}
+
+static int select_size(const struct sock *sk, bool sg, bool first_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
if (sk_can_gso(sk)) {
- /* Small frames wont use a full page:
- * Payload will immediately follow tcp header.
- */
- tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ tmp = linear_payload_sz(first_skb);
} else {
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
@@ -1101,6 +1125,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
@@ -1145,7 +1171,7 @@ restart:
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto out_err;
+ goto do_error;
sg = !!(sk->sk_route_caps & NETIF_F_SG);
@@ -1161,6 +1187,8 @@ restart:
}
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+ bool first_skb;
+
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1172,10 +1200,11 @@ new_segment:
process_backlog = false;
goto restart;
}
+ first_skb = skb_queue_empty(&sk->sk_write_queue);
skb = sk_stream_alloc_skb(sk,
- select_size(sk, sg),
+ select_size(sk, sg, first_skb),
sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ first_skb);
if (!skb)
goto wait_for_memory;
@@ -1219,7 +1248,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i == sysctl_max_skb_frags || !sg) {
+ if (i >= sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1311,8 +1340,11 @@ do_error:
out_err:
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
release_sock(sk);
return err;
}
@@ -1570,6 +1602,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_peek_len(struct socket *sock)
+{
+ return tcp_inq(sock->sk);
+}
+EXPORT_SYMBOL(tcp_peek_len);
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -2237,7 +2275,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0;
@@ -2274,7 +2312,7 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
- ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+ (sk->sk_state != TCP_LISTEN);
}
static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
@@ -2676,15 +2714,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+ struct tcp_info *info)
+{
+ u64 stats[__TCP_CHRONO_MAX], total = 0;
+ enum tcp_chrono i;
+
+ for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+ stats[i] = tp->chrono_stat[i - 1];
+ if (i == tp->chrono_type)
+ stats[i] += tcp_time_stamp - tp->chrono_start;
+ stats[i] *= USEC_PER_SEC / HZ;
+ total += stats[i];
+ }
+
+ info->tcpi_busy_time = total;
+ info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+ info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp;
- unsigned int start;
- int notsent_bytes;
+ u32 now = tcp_time_stamp, intv;
u64 rate64;
+ bool slow;
u32 rate;
memset(info, 0, sizeof(*info));
@@ -2693,6 +2749,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_state = sk_state_load(sk);
+ /* Report meaningful fields for all TCP states, including listeners */
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_pacing_rate = rate64;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_max_pacing_rate = rate64;
+
+ info->tcpi_reordering = tp->reordering;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+
+ if (info->tcpi_state == TCP_LISTEN) {
+ /* listeners aliased fields :
+ * tcpi_unacked -> Number of children ready for accept()
+ * tcpi_sacked -> max backlog
+ */
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ return;
+ }
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2720,13 +2797,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (info->tcpi_state == TCP_LISTEN) {
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
- } else {
- info->tcpi_unacked = tp->packets_out;
- info->tcpi_sacked = tp->sacked_out;
- }
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
@@ -2740,40 +2813,60 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
- info->tcpi_reordering = tp->reordering;
info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
- rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_pacing_rate);
+ slow = lock_sock_fast(sk);
- rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_max_pacing_rate);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+ tcp_get_info_chrono_stats(tp, info);
+
+ unlock_sock_fast(sk, slow);
- do {
- start = u64_stats_fetch_begin_irq(&tp->syncp);
- put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
- put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
- } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
- notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
- info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
+
+ info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+ rate = READ_ONCE(tp->rate_delivered);
+ intv = READ_ONCE(tp->rate_interval_us);
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ info->tcpi_delivery_rate = rate64;
+ }
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *stats;
+ struct tcp_info info;
+
+ stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ tcp_get_info_chrono_stats(tp, &info);
+ nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+ info.tcpi_busy_time, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+ info.tcpi_rwnd_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+ info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ return stats;
+}
+
static int do_tcp_getsockopt(struct sock *sk, int level,
int optname, char __user *optval, int __user *optlen)
{
@@ -3092,23 +3185,6 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);
-int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
- const struct tcphdr *th)
-{
- struct scatterlist sg;
- struct tcphdr hdr;
-
- /* We are not allowed to change tcphdr, make a local copy */
- memcpy(&hdr, th, sizeof(hdr));
- hdr.check = 0;
-
- /* options aren't included in the hash */
- sg_init_one(&sg, &hdr, sizeof(hdr));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
- return crypto_ahash_update(hp->md5_req);
-}
-EXPORT_SYMBOL(tcp_md5_hash_header);
-
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
const struct sk_buff *skb, unsigned int header_len)
{
@@ -3255,11 +3331,12 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
- unsigned long limit;
int max_rshare, max_wshare, cnt;
+ unsigned long limit;
unsigned int i;
- sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..b89bce4c721e
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,926 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ * |
+ * V
+ * +---> STARTUP ----+
+ * | | |
+ * | V |
+ * | DRAIN ----+
+ * | | |
+ * | V |
+ * +---> PROBE_BW ----+
+ * | ^ | |
+ * | | | |
+ * | +----+ |
+ * | |
+ * +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur unnecessary
+ * high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ restore_cwnd:1, /* decided to revert cwnd to old value */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ tso_segs_goal:7, /* segments we want in each skb we send */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:5,
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate */
+ cwnd_gain:10, /* current gain for setting cwnd */
+ full_bw_cnt:3, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ unused_b:6;
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ rate *= gain;
+ rate >>= BBR_SCALE;
+ rate *= USEC_PER_SEC;
+ return rate >> BW_SCALE;
+}
+
+/* Pace using current bw estimate and a gain factor. In order to help drive the
+ * network toward lower queues while maintaining high utilization and low
+ * latency, the average pacing rate aims to be slightly (~1%) lower than the
+ * estimated bandwidth. This is an important aspect of the design. In this
+ * implementation this slightly lower pacing rate is achieved implicitly by not
+ * including link-layer headers in the packet size used for the pacing rate.
+ */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 rate = bw;
+
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+ sk->sk_pacing_rate = rate;
+}
+
+/* Return count of segments we want in the skbs we send, or 0 for default. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->tso_segs_goal;
+}
+
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 min_segs;
+
+ min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
+ 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (event == CA_EVENT_TX_START && tp->app_limited) {
+ bbr->idle_restart = 1;
+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
+ * need more speed (we're restarting from idle and app-limited).
+ */
+ if (bbr->mode == BBR_PROBE_BW)
+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ }
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
+
+ w = (u64)bw * bbr->min_rtt_us;
+
+ /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr->tso_segs_goal;
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ return cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tp->snd_cwnd;
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ bbr->restore_cwnd = 1;
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state;
+
+ if (bbr->restore_cwnd) {
+ /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->restore_cwnd = 0;
+ }
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+ u32 acked, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd = 0, target_cwnd = 0;
+
+ if (!acked)
+ return;
+
+ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+ goto done;
+
+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
+ target_cwnd = bbr_target_cwnd(sk, bw, gain);
+ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
+ cwnd = min(cwnd + acked, target_cwnd);
+ else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+ cwnd = cwnd + acked;
+ cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
+ tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_full_length =
+ skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+ bbr->min_rtt_us;
+ u32 inflight, bw;
+
+ /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+ * use the pipe without increasing the queue.
+ */
+ if (bbr->pacing_gain == BBR_UNIT)
+ return is_full_length; /* just use wall clock time */
+
+ inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ bw = bbr_max_bw(sk);
+
+ /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+ * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+ * small (e.g. on a LAN). We do not persist if packets are lost, since
+ * a path with small buffers may not hold that much.
+ */
+ if (bbr->pacing_gain > BBR_UNIT)
+ return is_full_length &&
+ (rs->losses || /* perhaps pacing_gain*BDP won't fit */
+ inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+
+ /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+ * probing didn't find more bw. If inflight falls to match BDP then we
+ * estimate queue is drained; persisting would underutilize the pipe.
+ */
+ return is_full_length ||
+ inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+ bbr->cycle_mstamp = tp->delivered_mstamp;
+ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
+ bbr_is_next_cycle_phase(sk, rs))
+ bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_STARTUP;
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_PROBE_BW;
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+ bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+ if (!bbr_full_bw_reached(sk))
+ bbr_reset_startup_mode(sk);
+ else
+ bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+ bbr->lt_last_delivered = tp->delivered;
+ bbr->lt_last_lost = tp->lost;
+ bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_bw = 0;
+ bbr->lt_use_bw = 0;
+ bbr->lt_is_sampling = false;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 diff;
+
+ if (bbr->lt_bw) { /* do we have bw from a previous interval? */
+ /* Is new bw close to the lt_bw from the previous interval? */
+ diff = abs(bw - bbr->lt_bw);
+ if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+ (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+ bbr_lt_bw_diff)) {
+ /* All criteria are met; estimate we're policed. */
+ bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
+ bbr->lt_use_bw = 1;
+ bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
+ bbr->lt_rtt_cnt = 0;
+ return;
+ }
+ }
+ bbr->lt_bw = bw;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ s32 t;
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+ if (t < 1)
+ return; /* interval is less than one jiffy, so wait */
+ t = jiffies_to_usecs(t);
+ /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
+ if (t < 1) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ bw = (u64)delivered * BW_UNIT;
+ do_div(bw, t);
+ bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->round_start = 0;
+ if (rs->delivered < 0 || rs->interval_us <= 0)
+ return; /* Not a valid observation */
+
+ /* See if we've reached the next RTT */
+ if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->rtt_cnt++;
+ bbr->round_start = 1;
+ bbr->packet_conservation = 0;
+ }
+
+ bbr_lt_bw_sampling(sk, rs);
+
+ /* Divide delivered by the interval to find a (lower bound) bottleneck
+ * bandwidth sample. Delivered is in packets and interval_us in uS and
+ * ratio will be <<1 for most connections. So delivered is first scaled.
+ */
+ bw = (u64)rs->delivered * BW_UNIT;
+ do_div(bw, rs->interval_us);
+
+ /* If this sample is application-limited, it is likely to have a very
+ * low delivered count that represents application behavior rather than
+ * the available network rate. Such a sample could drag down estimated
+ * bw, causing needless slow-down. Thus, to continue to send at the
+ * last measured network rate, we filter out app-limited samples unless
+ * they describe the path bw at least as well as our bw model.
+ *
+ * So the goal during app-limited phase is to proceed with the best
+ * network rate no matter how long. We automatically leave this
+ * phase when app writes faster than the network can deliver :)
+ */
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+ /* Incorporate new sample into our max bw filter. */
+ minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+ }
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_time_stamp,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ }
+
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ bbr->probe_rtt_done_stamp = tcp_time_stamp +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done &&
+ after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ bbr->restore_cwnd = 1; /* snap to prior_cwnd */
+ bbr_reset_mode(sk);
+ }
+ }
+ }
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw;
+
+ bbr_update_model(sk, rs);
+
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_tso_segs_goal(sk);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->prior_cwnd = 0;
+ bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_time_stamp;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+ bw = (u64)tp->snd_cwnd * BW_UNIT;
+ do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+ sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
+ bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+ bbr->restore_cwnd = 0;
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp.v64 = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+ bbr_save_cwnd(sk);
+ return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw(sk);
+
+ bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+ memset(&info->bbr, 0, sizeof(info->bbr));
+ info->bbr.bbr_bw_lo = (u32)bw;
+ info->bbr.bbr_bw_hi = (u32)(bw >> 32);
+ info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr);
+ }
+ return 0;
+}
+
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .losses = 1 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ bbr->round_start = 1; /* treat RTO like end of a round */
+ bbr_lt_bw_sampling(sk, &rs);
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .tso_segs_goal = bbr_tso_segs_goal,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 03725b294286..35b280361cb2 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
-struct minmax {
+struct cdg_minmax {
union {
struct {
s32 min;
@@ -74,10 +74,10 @@ enum cdg_state {
};
struct cdg {
- struct minmax rtt;
- struct minmax rtt_prev;
- struct minmax *gradients;
- struct minmax gsum;
+ struct cdg_minmax rtt;
+ struct cdg_minmax rtt_prev;
+ struct cdg_minmax *gradients;
+ struct cdg_minmax gsum;
bool gfilled;
u8 tail;
u8 state;
@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct minmax *gradients;
+ struct cdg_minmax *gradients;
switch (ev) {
case CA_EVENT_CWND_RESTART:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4e72bc..79c4817abc94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
int ret = 0;
- /* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid) {
+ /* all algorithms must implement these */
+ if (!ca->ssthresh || !ca->undo_cwnd ||
+ !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
@@ -200,8 +201,10 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
- if (sk->sk_state != TCP_CLOSE)
+ if (sk->sk_state != TCP_CLOSE) {
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
tcp_init_congestion_control(sk);
+ }
}
/* Manage refcounts on socket close. */
@@ -441,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+u32 tcp_reno_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
+
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
};
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 10d728b6804c..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -56,6 +56,7 @@ struct dctcp {
u32 next_seq;
u32 ce_state;
u32 delayed_ack_reserved;
+ u32 loss_cwnd;
};
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
@@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk)
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
ca->delayed_ack_reserved = 0;
+ ca->loss_cwnd = 0;
ca->ce_state = 0;
dctcp_reset(tp, ca);
@@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk)
static u32 dctcp_ssthresh(struct sock *sk)
{
- const struct dctcp *ca = inet_csk_ca(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
@@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
return 0;
}
+static u32 dctcp_cwnd_undo(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
static struct tcp_congestion_ops dctcp __read_mostly = {
.init = dctcp_init,
.in_ack_event = dctcp_update_alpha,
.cwnd_event = dctcp_cwnd_event,
.ssthresh = dctcp_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = dctcp_cwnd_undo,
.set_state = dctcp_state,
.get_info = dctcp_get_info,
.flags = TCP_CONG_NEEDS_ECN,
@@ -324,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = {
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.get_info = dctcp_get_info,
.owner = THIS_MODULE,
.name = "dctcp-reno",
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4e777a3243f9..dd2560c83a85 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
struct tcp_fastopen_cookie tmp;
if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
- struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ struct in6_addr *buf = &tmp.addr;
int i;
for (i = 0; i < 4; i++)
@@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
* scaled. So correct it appropriately.
*/
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+ tp->max_window = tp->snd_wnd;
/* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the ehash
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index db7842495a64..6d9879e93648 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,6 +94,7 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
+ u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 hstcp_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- const struct hstcp *ca = inet_csk_ca(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
+static u32 hstcp_cwnd_undo(struct sock *sk)
+{
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
+ .undo_cwnd = hstcp_cwnd_undo,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 083831e359df..0f7175c3338e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index c8e6d86be114..60352ff4f5a8 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,6 +48,7 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
u32 beta; /* Muliplicative decrease */
+ u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
+static u32 tcp_illinois_cwnd_undo(struct sock *sk)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
+ .undo_cwnd = tcp_illinois_cwnd_undo,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a756b8749a26..41dcbd568cbe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,6 +85,7 @@ int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;
@@ -128,6 +129,23 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+{
+ static bool __once __read_mostly;
+
+ if (!__once) {
+ struct net_device *dev;
+
+ __once = true;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
+ }
+}
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -144,7 +162,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
- icsk->icsk_ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+ tcp_sk(sk)->advmss);
+ if (unlikely(icsk->icsk_ack.rcv_mss != len))
+ tcp_gro_dev_warn(sk, skb);
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -289,6 +310,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
@@ -309,7 +331,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
- sndmem = 2 * nr_segs * per_mss;
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +922,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ * and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ * and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!(sacked & TCPCB_LOST) ||
+ ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+ tp->lost += tcp_skb_pcount(skb);
+}
+
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
@@ -913,6 +953,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
+ tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1135,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct rate_sample *rate;
int flag;
};
@@ -1261,6 +1303,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1311,6 +1354,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1540,6 +1586,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1622,8 +1669,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
- if (found_dup_sack)
+ if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
+ tp->delivered++; /* A spurious retransmission is delivered */
+ }
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1890,6 +1939,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1973,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+ mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ is_reneg);
+ if (mark_lost)
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2361,10 +2415,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->undo_cwnd)
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
- else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2502,6 +2553,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2878,67 +2932,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
- const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
- struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = {
- .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
- .ts = now,
- };
- u32 elapsed;
-
- /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
- if (unlikely(rttm.rtt <= m[0].rtt))
- m[0] = m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[1].rtt)
- m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[2].rtt)
- m[2] = rttm;
-
- elapsed = now - m[0].ts;
- if (unlikely(elapsed > wlen)) {
- /* Passed entire window without a new min so make 2nd choice
- * the new min & 3rd choice the new 2nd. So forth and so on.
- */
- m[0] = m[1];
- m[1] = m[2];
- m[2] = rttm;
- if (now - m[0].ts > wlen) {
- m[0] = m[1];
- m[1] = rttm;
- if (now - m[0].ts > wlen)
- m[0] = rttm;
- }
- } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
- /* Passed a quarter of the window without a new min so
- * take 2nd choice from the 2nd quarter of the window.
- */
- m[2] = m[1] = rttm;
- } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
- /* Passed half the window without a new min so take the 3rd
- * choice from the last half of the window.
- */
- m[2] = rttm;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+ minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+ rtt_us ? : jiffies_to_usecs(1));
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3101,10 +3101,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack,
+ struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct skb_mstamp first_ackt, last_ackt, now;
+ struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3136,7 +3137,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
-
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3172,6 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -3198,22 +3199,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->lost_skb_hint = NULL;
}
+ if (!skb)
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
- seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
- sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+ sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
-
+ sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
@@ -3241,7 +3244,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3332,8 +3335,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
- int flag)
+ int flag, const struct rate_sample *rs)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, rs);
+ return;
+ }
+
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3362,9 +3372,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
u32 delta = ack - tp->snd_una;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3374,9 +3382,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
u32 delta = seq - tp->rcv_nxt;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3578,17 +3584,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- u32 prior_delivered = tp->delivered;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
+ sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3611,6 +3621,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ skb_mstamp_get(&now);
+
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3621,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3676,7 +3689,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ &sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3693,7 +3706,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+ tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -4107,7 +4123,7 @@ void tcp_fin(struct sock *sk)
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
@@ -4267,7 +4283,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_empty(&tp->out_of_order_queue)) {
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
return;
}
@@ -4343,10 +4359,13 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
+ bool fin, fragstolen, eaten;
struct sk_buff *skb, *tail;
- bool fragstolen, eaten;
+ struct rb_node *p;
- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ p = rb_first(&tp->out_of_order_queue);
+ while (p) {
+ skb = rb_entry(p, struct sk_buff, rbnode);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
@@ -4356,9 +4375,10 @@ static void tcp_ofo_queue(struct sock *sk)
dsack_high = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &tp->out_of_order_queue);
- __skb_unlink(skb, &tp->out_of_order_queue);
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
tcp_drop(sk, skb);
continue;
@@ -4370,12 +4390,19 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- tcp_fin(sk);
- if (eaten)
+ else
kfree_skb_partial(skb, fragstolen);
+
+ if (unlikely(fin)) {
+ tcp_fin(sk);
+ /* tcp_fin() purges tp->out_of_order_queue,
+ * so we must end this loop right now.
+ */
+ break;
+ }
}
}
@@ -4391,12 +4418,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
if (tcp_prune_queue(sk) < 0)
return -1;
- if (!sk_rmem_schedule(sk, skb, size)) {
+ while (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
-
- if (!sk_rmem_schedule(sk, skb, size))
- return -1;
}
}
return 0;
@@ -4405,8 +4429,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node **p, *q, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
+ bool fragstolen;
tcp_ecn_check_ce(tp, skb);
@@ -4421,88 +4447,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ tp->rcv_nxt, seq, end_seq);
- skb1 = skb_peek_tail(&tp->out_of_order_queue);
- if (!skb1) {
+ p = &tp->out_of_order_queue.rb_node;
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
- }
- __skb_queue_head(&tp->out_of_order_queue, skb);
- goto end;
- }
-
- seq = TCP_SKB_CB(skb)->seq;
- end_seq = TCP_SKB_CB(skb)->end_seq;
-
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- bool fragstolen;
-
- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
- } else {
- tcp_grow_window(sk, skb);
- kfree_skb_partial(skb, fragstolen);
- skb = NULL;
+ tp->selective_acks[0].start_seq = seq;
+ tp->selective_acks[0].end_seq = end_seq;
}
-
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
-
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+ tp->ooo_last_skb = skb;
goto end;
}
- /* Find place to insert this segment. */
- while (1) {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
- skb1 = NULL;
- break;
- }
- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
- }
-
- /* Do skb overlap to previous one? */
- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
- skb = NULL;
- tcp_dsack_set(sk, seq, end_seq);
- goto add_sack;
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ goto add_sack;
+ }
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+ parent = &tp->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
+
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+ p = &parent->rb_left;
+ continue;
}
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(sk, seq,
- TCP_SKB_CB(skb1)->end_seq);
- } else {
- if (skb_queue_is_first(&tp->out_of_order_queue,
- skb1))
- skb1 = NULL;
- else
- skb1 = skb_queue_prev(
- &tp->out_of_order_queue,
- skb1);
+ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &tp->out_of_order_queue);
+ tcp_dsack_extend(sk,
+ TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ goto coalesce_done;
}
+ p = &parent->rb_right;
}
- if (!skb1)
- __skb_queue_head(&tp->out_of_order_queue, skb);
- else
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
- /* And clean segments covered by new one as whole. */
- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((q = rb_next(&skb->rbnode)) != NULL) {
+ skb1 = rb_entry(q, struct sk_buff, rbnode);
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
@@ -4511,12 +4541,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
end_seq);
break;
}
- __skb_unlink(skb1, &tp->out_of_order_queue);
+ rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
}
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!q)
+ tp->ooo_last_skb = skb;
add_sack:
if (tcp_is_sack(tp))
@@ -4653,13 +4686,13 @@ queue_and_out:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (skb_queue_empty(&tp->out_of_order_queue))
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pingpong = 0;
}
@@ -4713,48 +4746,76 @@ drop:
tcp_data_queue_ofo(sk, skb);
}
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ if (list)
+ return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
- struct sk_buff_head *list)
+ struct sk_buff_head *list,
+ struct rb_root *root)
{
- struct sk_buff *next = NULL;
+ struct sk_buff *next = tcp_skb_next(skb, list);
- if (!skb_queue_is_last(list, skb))
- next = skb_queue_next(list, skb);
+ if (list)
+ __skb_unlink(skb, list);
+ else
+ rb_erase(&skb->rbnode, root);
- __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
*
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
- struct sk_buff *head, struct sk_buff *tail,
- u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
- struct sk_buff *skb, *n;
+ struct sk_buff *skb = head, *n;
+ struct sk_buff_head tmp;
bool end_of_skbs;
/* First, check that queue is collapsible and find
- * the point where collapsing can be useful. */
- skb = head;
+ * the point where collapsing can be useful.
+ */
restart:
- end_of_skbs = true;
- skb_queue_walk_from_safe(list, skb, n) {
- if (skb == tail)
- break;
+ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+ n = tcp_skb_next(skb, list);
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb)
break;
goto restart;
@@ -4772,13 +4833,10 @@ restart:
break;
}
- if (!skb_queue_is_last(list, skb)) {
- struct sk_buff *next = skb_queue_next(list, skb);
- if (next != tail &&
- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
- end_of_skbs = false;
- break;
- }
+ if (n && n != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+ end_of_skbs = false;
+ break;
}
/* Decided to skip this, advance start seq. */
@@ -4788,17 +4846,22 @@ restart:
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
+ __skb_queue_head_init(&tmp);
+
while (before(start, end)) {
int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
- return;
+ break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_queue_before(list, skb, nskb);
+ if (list)
+ __skb_queue_before(list, skb, nskb);
+ else
+ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -4816,14 +4879,17 @@ restart:
start += size;
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
- return;
+ goto end;
}
}
}
+end:
+ skb_queue_walk_safe(&tmp, skb, n)
+ tcp_rbtree_insert(root, skb);
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4832,70 +4898,86 @@ restart:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
- struct sk_buff *head;
+ struct sk_buff *skb, *head;
+ struct rb_node *p;
u32 start, end;
- if (!skb)
+ p = rb_first(&tp->out_of_order_queue);
+ skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+ if (!skb) {
+ p = rb_last(&tp->out_of_order_queue);
+ /* Note: This is possible p is NULL here. We do not
+ * use rb_entry_safe(), as ooo_last_skb is valid only
+ * if rbtree is not empty.
+ */
+ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
return;
-
+ }
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
- head = skb;
-
- for (;;) {
- struct sk_buff *next = NULL;
- if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
- next = skb_queue_next(&tp->out_of_order_queue, skb);
- skb = next;
+ for (head = skb;;) {
+ skb = tcp_skb_next(skb, NULL);
- /* Segment is terminated when we see gap or when
- * we are at the end of all the queue. */
+ /* Range is terminated when we see a gap or when
+ * we are at the queue end.
+ */
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, &tp->out_of_order_queue,
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
- head = skb;
- if (!skb)
- break;
- /* Start new segment */
+ goto new_range;
+ }
+
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
- } else {
- if (before(TCP_SKB_CB(skb)->seq, start))
- start = TCP_SKB_CB(skb)->seq;
- if (after(TCP_SKB_CB(skb)->end_seq, end))
- end = TCP_SKB_CB(skb)->end_seq;
- }
}
}
/*
- * Purge the out-of-order queue.
- * Return true if queue was pruned.
+ * Clean the out-of-order queue to make room.
+ * We drop high sequences packets to :
+ * 1) Let a chance for holes to be filled.
+ * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up
+ * freeing whole queue here)
+ *
+ * Return true if queue has shrunk.
*/
static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool res = false;
+ struct rb_node *node, *prev;
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
- __skb_queue_purge(&tp->out_of_order_queue);
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ return false;
- /* Reset SACK state. A conforming SACK implementation will
- * do the same at a timeout based retransmit. When a connection
- * is in a sad state like this, we care only about integrity
- * of the connection not performance.
- */
- if (tp->rx_opt.sack_ok)
- tcp_sack_reset(&tp->rx_opt);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
+ tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
sk_mem_reclaim(sk);
- res = true;
- }
- return res;
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ !tcp_under_memory_pressure(sk))
+ break;
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ return true;
}
/* Reduce allocated memory if we can, trying to get
@@ -4920,7 +5002,7 @@ static int tcp_prune_queue(struct sock *sk)
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
- tcp_collapse(sk, &sk->sk_receive_queue,
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
@@ -4996,10 +5078,13 @@ static void tcp_check_space(struct sock *sk)
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/* pairs with tcp_poll() */
- smp_mb__after_atomic();
+ smp_mb();
if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
}
}
@@ -5025,7 +5110,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
- (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
@@ -5926,7 +6011,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
- tcp_update_pacing_rate(sk);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
@@ -6232,13 +6318,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
-
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6248,6 +6328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
tcp_rsk(req)->af_specific = af_ops;
+ tcp_rsk(req)->ts_off = 0;
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6259,6 +6340,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
@@ -6268,6 +6350,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
+ if (isn && tmp_opt.tstamp_ok)
+ af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
if (!want_cookie && !isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
@@ -6308,7 +6393,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb);
+ isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
}
if (!dst) {
dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6320,6 +6405,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ tcp_rsk(req)->ts_off = 0;
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7158d4f8dae4..fe9da4fb96bf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,9 +84,7 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
@@ -96,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -121,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
+ (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
@@ -238,7 +236,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
- usin->sin_port);
+ usin->sin_port,
+ &tp->tsoffset);
inet->inet_id = tp->write_seq ^ jiffies;
@@ -443,7 +442,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
@@ -692,6 +691,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -712,7 +712,7 @@ out:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct net *net,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
@@ -727,6 +727,7 @@ static void tcp_v4_send_ack(struct net *net,
#endif
];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -776,6 +777,7 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -791,7 +793,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(sock_net(sk), skb,
+ tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
@@ -819,10 +821,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
- tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1175,6 +1177,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
&iph->saddr, ntohs(th->source),
&iph->daddr, ntohs(th->dest),
@@ -1195,7 +1198,6 @@ static void tcp_v4_init_req(struct request_sock *req,
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->no_srccheck = inet_sk(sk_listener)->transparent;
ireq->opt = tcp_v4_save_options(skb);
}
@@ -1537,6 +1539,49 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_prequeue);
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
+{
+ u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
+
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64*1024;
+
+ /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
+ * we can fix skb->truesize to its real value to avoid future drops.
+ * This is valid because skb is not yet charged to the socket.
+ * It has been noticed pure SACK packets were sometimes dropped
+ * (if cooked by drivers without copybreak feature).
+ */
+ if (!skb->data_len)
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+
+ if (unlikely(sk_add_backlog(sk, skb, limit))) {
+ bh_unlock_sock(sk);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_add_backlog);
+
+int tcp_filter(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcphdr *th = (struct tcphdr *)skb->data;
+ unsigned int eaten = skb->len;
+ int err;
+
+ err = sk_filter_trim_cap(sk, skb, th->doff * 4);
+ if (!err) {
+ eaten -= skb->len;
+ TCP_SKB_CB(skb)->end_seq -= eaten;
+ }
+ return err;
+}
+EXPORT_SYMBOL(tcp_filter);
+
/*
* From tcp_input.c
*/
@@ -1608,6 +1653,7 @@ process:
sk = req->rsk_listener;
if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1648,8 +1694,10 @@ process:
nf_reset(skb);
- if (sk_filter(sk, skb))
+ if (tcp_filter(sk, skb))
goto discard_and_relse;
+ th = (const struct tcphdr *)skb->data;
+ iph = ip_hdr(skb);
skb->dev = NULL;
@@ -1666,10 +1714,7 @@ process:
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
- } else if (unlikely(sk_add_backlog(sk, skb,
- sk->sk_rcvbuf + sk->sk_sndbuf))) {
- bh_unlock_sock(sk);
- __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+ } else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
}
bh_unlock_sock(sk);
@@ -1818,7 +1863,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_write_queue_purge(sk);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
@@ -1845,9 +1890,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
local_bh_disable();
sk_sockets_allocated_dec(sk);
local_bh_enable();
-
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1864,13 +1906,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
struct inet_listen_hashbucket *ilb;
- struct inet_connection_sock *icsk;
struct sock *sk = cur;
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
@@ -1886,9 +1927,8 @@ get_sk:
continue;
if (sk->sk_family == st->family)
return sk;
- icsk = inet_csk(sk);
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE)
goto get_head;
@@ -2096,7 +2136,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
@@ -2415,6 +2455,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+ net->ipv4.sysctl_tcp_tw_reuse = 0;
return 0;
fail:
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c67ece1390c2..046fd3910873 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_lp_cong_avoid,
.pkts_acked = tcp_lp_pkts_acked,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b617826e2477..ba8f02d0f283 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -606,7 +606,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
return ret;
}
-EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
@@ -742,16 +741,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
rcu_read_unlock();
}
-static struct genl_family tcp_metrics_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = TCP_METRICS_GENL_NAME,
- .version = TCP_METRICS_GENL_VERSION,
- .maxattr = TCP_METRICS_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family tcp_metrics_nl_family;
-static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
[TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
[TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
.len = sizeof(struct in6_addr), },
@@ -1116,6 +1108,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
},
};
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tcp_metrics_nl_ops,
+ .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+};
+
static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
@@ -1179,8 +1182,7 @@ void __init tcp_metrics_init(void)
if (ret < 0)
panic("Could not allocate the tcp_metrics hash table\n");
- ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
- tcp_metrics_nl_ops);
+ ret = genl_register_family(&tcp_metrics_nl_family);
if (ret < 0)
panic("Could not register tcp_metrics generic netlink\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4b95ec4ed2c8..28ce5ee831f5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- newtp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -487,8 +487,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
tcp_init_xmit_timers(newsk);
- __skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
@@ -530,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
- newtp->tsoffset = 0;
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -579,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c5964962d0c..bc68da38ea86 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
- /* GSO partial only requires splitting the frame into an MSS
- * multiple and possibly a remainder. So update the mss now.
- */
- if (features & NETIF_F_GSO_PARTIAL)
- mss = skb->len - (skb->len % mss);
-
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
delta = htonl(oldlen + (thlen + mss));
skb = segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d48d5571e62a..8ce50dc3ab8c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
- TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
+ }
}
/*
* One tasklet per cpu tries to send more skbs.
@@ -762,25 +769,27 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ smp_mb__before_atomic();
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -790,18 +799,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -815,15 +823,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -853,6 +861,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -870,16 +879,25 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
@@ -918,6 +936,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
+ tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -1019,7 +1038,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1213,6 +1232,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
@@ -1358,6 +1380,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
@@ -1502,6 +1525,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
+
+ /* The following conditions together indicate the starvation
+ * is caused by insufficient sender buffer:
+ * 1) just sent some data (see tcp_write_xmit)
+ * 2) not cwnd limited (this else condition)
+ * 3) no more data to send (null tcp_send_head )
+ * 4) application is hitting buffer limit (SOCK_NOSPACE)
+ */
+ if (!tcp_send_head(sk) && sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+ (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -1545,7 +1580,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
u32 bytes, segs;
@@ -1557,10 +1593,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
- segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+ segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+ return tso_segs ? :
+ tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -1884,26 +1933,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2022,6 +2071,89 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned int limit;
+
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit <<= factor;
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
+
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_time_stamp;
+
+ if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, so that the highest priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking and starts busy chrono if we have pending data.
+ */
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2044,7 +2176,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
- bool is_cwnd_limited = false;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
@@ -2059,7 +2191,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
- max_segs = tcp_tso_autosize(sk, mss_now);
+ max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -2081,8 +2213,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
break;
+ }
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2108,29 +2242,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- /* TCP Small Queues :
- * Control number of packets in qdisc/devices to two packets / or ~1 ms.
- * This allows for :
- * - better RTT estimation and ACK scheduling
- * - faster recovery
- * - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
- */
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- /* It is possible TX completion already happened
- * before we set TSQ_THROTTLED, so we must
- * test again the condition.
- */
- smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
- break;
- }
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2148,6 +2263,11 @@ repair:
break;
}
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+ tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2398,9 +2518,11 @@ u32 __tcp_select_window(struct sock *sk)
int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
- if (mss > full_space)
+ if (unlikely(mss > full_space)) {
mss = full_space;
-
+ if (mss <= 0)
+ return 0;
+ }
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
@@ -2476,7 +2598,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
}
/* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2487,13 +2609,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ if (next_skb_size) {
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+ else if (!skb_shift(skb, next_skb, next_skb_size))
+ return false;
+ }
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
- skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
- next_skb_size);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2522,6 +2648,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
sk_wmem_free_skb(sk, next_skb);
+ return true;
}
/* Check if coalescing SKBs is legal. */
@@ -2529,14 +2656,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
return false;
- /* TODO: SACK collapsing could be used to remove this condition */
- if (skb_shinfo(skb)->nr_frags != 0)
- return false;
if (skb_cloned(skb))
return false;
if (skb == tcp_send_head(sk))
return false;
- /* Some heurestics for collapsing over SACK'd could be invented */
+ /* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2574,16 +2698,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (space < 0)
break;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second
- */
- if (skb->len > skb_availroom(to))
- break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
- tcp_collapse_retrans(sk, to);
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
}
@@ -2777,9 +2897,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ __u8 sacked;
int segs;
if (skb == tcp_send_head(sk))
@@ -2791,6 +2911,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
return;
+ sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too bigs TSO packets
*/
@@ -2830,6 +2951,9 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
+ if (tcp_small_queue_check(sk, skb, 1))
+ return;
+
if (tcp_retransmit_skb(sk, skb, segs))
return;
@@ -3081,7 +3205,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#endif
/* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -3258,6 +3382,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
@@ -3422,8 +3548,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f6c50af24a64..3d063eb37848 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -117,7 +117,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
(fwmark > 0 && skb->mark == fwmark)) &&
(full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
- spin_lock(&tcp_probe.lock);
+ spin_lock_bh(&tcp_probe.lock);
/* If log fills, just silently drop */
if (tcp_probe_avail() > 1) {
struct tcp_log *p = tcp_probe.log + tcp_probe.head;
@@ -157,7 +157,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
}
tcp_probe.lastcwnd = tp->snd_cwnd;
- spin_unlock(&tcp_probe.lock);
+ spin_unlock_bh(&tcp_probe.lock);
wake_up(&tcp_probe.wait);
}
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..9be1581a5a08
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ tp->delivered_mstamp = skb->skb_mstamp;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (!scb->tx.delivered_mstamp.v64)
+ return;
+
+ if (!rs->prior_delivered ||
+ after(scb->tx.delivered, rs->prior_delivered)) {
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = skb_mstamp_us_delta(
+ &skb->skb_mstamp,
+ &scb->tx.first_tx_mstamp);
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ struct skb_mstamp *now, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = *now;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available. */
+ if (!rs->prior_mstamp.v64) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmistted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index bf5ea9e9bbc1..f2123075ce6e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,6 +15,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
+struct scalable {
+ u32 loss_cwnd;
+};
+
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ struct scalable *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
+static u32 tcp_scalable_cwnd_undo(struct sock *sk)
+{
+ const struct scalable *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
+ .undo_cwnd = tcp_scalable_cwnd_undo,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f712b411f6ed..3705075f42c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ } else if (!tp->syn_data && !tp->syn_fastopen) {
+ sk_rethink_txhash(sk);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
@@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk)
tcp_mtu_probing(icsk, sk);
dst_negative_advice(sk);
+ } else {
+ sk_rethink_txhash(sk);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
@@ -306,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -588,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 4c4bac1b5eab..218cfcc77650 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_vegas_cong_avoid,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 40171e163cff..76005d4b8dfc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,6 +30,7 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
+ u32 loss_cwnd; /* cwnd when loss occured */
};
/* There are several situations when we must "re-start" Veno:
@@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
+ veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
+static u32 tcp_veno_cwnd_undo(struct sock *sk)
+{
+ const struct veno *veno = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
+ .undo_cwnd = tcp_veno_cwnd_undo,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4b03a2e2a050..fed66dc0e0f5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = tcp_westwood_event,
.in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9c5fc973267f..e6ff99c4bd3b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,6 +37,7 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
+ u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
+ yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
+static u32 tcp_yeah_cwnd_undo(struct sock *sk)
+{
+ const struct yeah *yeah = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
+ .undo_cwnd = tcp_yeah_cwnd_undo,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5fdcb8d108d4..8aab7d78d25b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -79,7 +79,7 @@
#define pr_fmt(fmt) "UDP: " fmt
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
@@ -114,6 +114,7 @@
#include <net/busy_poll.h>
#include "udp_impl.h"
#include <net/sock_reuseport.h>
+#include <net/addrconf.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -579,7 +580,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
+ IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
@@ -1018,13 +1020,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
- faddr, saddr, dport, inet->inet_sport);
-
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, fl4);
- if (err < 0)
- goto out;
- }
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1177,6 +1174,181 @@ out:
return ret;
}
+/* fully reclaim rmem/fwd memory allocated for skb */
+static void udp_rmem_release(struct sock *sk, int size, int partial)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int amt;
+
+ if (likely(partial)) {
+ up->forward_deficit += size;
+ size = up->forward_deficit;
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&sk->sk_receive_queue))
+ return;
+ } else {
+ size += up->forward_deficit;
+ }
+ up->forward_deficit = 0;
+
+ sk->sk_forward_alloc += size;
+ amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
+ sk->sk_forward_alloc -= amt;
+
+ if (amt)
+ __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+
+ atomic_sub(size, &sk->sk_rmem_alloc);
+}
+
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
+{
+ udp_rmem_release(sk, skb->dev_scratch, 1);
+}
+EXPORT_SYMBOL(udp_skb_destructor);
+
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylock can be allocated on a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+ spinlock_t *busy;
+
+ busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+ spin_lock(busy);
+ return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+ if (busy)
+ spin_unlock(busy);
+}
+
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ int rmem, delta, amt, err = -ENOMEM;
+ spinlock_t *busy = NULL;
+ int size;
+
+ /* try to avoid the costly atomic add/sub pair when the receive
+ * queue is full; always allow at least a packet
+ */
+ rmem = atomic_read(&sk->sk_rmem_alloc);
+ if (rmem > sk->sk_rcvbuf)
+ goto drop;
+
+ /* Under mem pressure, it might be helpful to help udp_recvmsg()
+ * having linear skbs :
+ * - Reduce memory overhead and thus increase receive queue capacity
+ * - Less cache line misses at copyout() time
+ * - Less work at consume_skb() (less alien page frag freeing)
+ */
+ if (rmem > (sk->sk_rcvbuf >> 1)) {
+ skb_condense(skb);
+
+ busy = busylock_acquire(sk);
+ }
+ size = skb->truesize;
+ /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+ * in udp_skb_destructor()
+ */
+ skb->dev_scratch = size;
+
+ /* we drop only if the receive buf is full and the receive
+ * queue contains some other skb
+ */
+ rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
+ if (rmem > (size + sk->sk_rcvbuf))
+ goto uncharge_drop;
+
+ spin_lock(&list->lock);
+ if (size >= sk->sk_forward_alloc) {
+ amt = sk_mem_pages(size);
+ delta = amt << SK_MEM_QUANTUM_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
+ err = -ENOBUFS;
+ spin_unlock(&list->lock);
+ goto uncharge_drop;
+ }
+
+ sk->sk_forward_alloc += delta;
+ }
+
+ sk->sk_forward_alloc -= size;
+
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
+ */
+ sock_skb_set_dropcount(sk, skb);
+
+ __skb_queue_tail(list, skb);
+ spin_unlock(&list->lock);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ busylock_release(busy);
+ return 0;
+
+uncharge_drop:
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+
+drop:
+ atomic_inc(&sk->sk_drops);
+ busylock_release(busy);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+
+void udp_destruct_sock(struct sock *sk)
+{
+ /* reclaim completely the forward allocated memory */
+ unsigned int total = 0;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ total += skb->truesize;
+ kfree_skb(skb);
+ }
+ udp_rmem_release(sk, total, 0);
+
+ inet_sock_destruct(sk);
+}
+EXPORT_SYMBOL_GPL(udp_destruct_sock);
+
+int udp_init_sock(struct sock *sk)
+{
+ sk->sk_destruct = udp_destruct_sock;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_init_sock);
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{
+ if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+ bool slow = lock_sock_fast(sk);
+
+ sk_peek_offset_bwd(sk, len);
+ unlock_sock_fast(sk, slow);
+ }
+ consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_consume_udp);
+
/**
* first_packet_length - return length of first packet in receive queue
* @sk: socket
@@ -1186,12 +1358,11 @@ out:
*/
static int first_packet_length(struct sock *sk)
{
- struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
struct sk_buff *skb;
+ int total = 0;
int res;
- __skb_queue_head_init(&list_kill);
-
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
@@ -1201,18 +1372,13 @@ static int first_packet_length(struct sock *sk)
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
- __skb_queue_tail(&list_kill, skb);
+ total += skb->truesize;
+ kfree_skb(skb);
}
res = skb ? skb->len : -1;
+ if (total)
+ udp_rmem_release(sk, total, 1);
spin_unlock_bh(&rcvq->lock);
-
- if (!skb_queue_empty(&list_kill)) {
- bool slow = lock_sock_fast(sk);
-
- __skb_queue_purge(&list_kill);
- sk_mem_reclaim_partial(sk);
- unlock_sock_fast(sk, slow);
- }
return res;
}
@@ -1261,15 +1427,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
- bool slow;
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);
try_again:
peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -1286,7 +1450,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -1302,13 +1467,12 @@ try_again:
}
if (unlikely(err)) {
- trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
- skb_free_datagram_locked(sk, skb);
+ kfree_skb(skb);
return err;
}
@@ -1327,22 +1491,21 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off);
+ ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
- __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags)) {
+ if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
- unlock_sock_fast(sk, slow);
+ kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -1350,7 +1513,7 @@ csum_copy_err:
goto try_again;
}
-int udp_disconnect(struct sock *sk, int flags)
+int __udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
/*
@@ -1372,6 +1535,15 @@ int udp_disconnect(struct sock *sk, int flags)
sk_dst_reset(sk);
return 0;
}
+EXPORT_SYMBOL(__udp_disconnect);
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+ lock_sock(sk);
+ __udp_disconnect(sk, flags);
+ release_sock(sk);
+ return 0;
+}
EXPORT_SYMBOL(udp_disconnect);
void udp_lib_unhash(struct sock *sk)
@@ -1451,7 +1623,7 @@ static void udp_v4_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1459,9 +1631,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- rc = __sock_queue_rcv_skb(sk, skb);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1476,7 +1650,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
return 0;
-
}
static struct static_key udp_encap_needed __read_mostly;
@@ -1498,7 +1671,6 @@ EXPORT_SYMBOL(udp_encap_enable);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
/*
@@ -1585,25 +1757,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
udp_csum_pull_header(skb);
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- goto drop;
- }
-
- rc = 0;
ipv4_pktinfo_prepare(sk, skb);
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
-
- return rc;
+ return __udp_queue_rcv_skb(sk, skb);
csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -1648,10 +1804,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
- udp_table.mask;
- hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+ udptable->mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udp_table.hash2[hash2];
+ hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -2192,6 +2348,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
EXPORT_SYMBOL(udp_poll);
+int udp_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_abort);
+
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
@@ -2199,13 +2369,13 @@ struct proto udp_prot = {
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
+ .init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
@@ -2221,7 +2391,7 @@ struct proto udp_prot = {
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
- .clear_sk = sk_prot_clear_portaddr_nulls,
+ .diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
@@ -2494,6 +2664,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
void __init udp_init(void)
{
unsigned long limit;
+ unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -2504,4 +2675,13 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+ /* 16 spinlocks per cpu */
+ udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+ udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+ GFP_KERNEL);
+ if (!udp_busylocks)
+ panic("UDP: failed to alloc udp_busylocks\n");
+ for (i = 0; i < (1U << udp_busylocks_log); i++)
+ spin_lock_init(udp_busylocks + i);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 3d5ccf4b1412..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct nlattr *bc, bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
return inet_sk_diag_fill(sk, NULL, skb, req,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
err = inet_sk_diag_fill(sk, NULL, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
int num, s_num, slot, s_slot;
@@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
@@ -165,12 +167,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_wqueue = sk_wmem_alloc_get(sk);
}
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int __udp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req,
+ struct udp_table *tbl)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_if, tbl, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[3], req->id.idiag_dport,
+ req->id.idiag_src[3], req->id.idiag_sport,
+ req->id.idiag_if, tbl, NULL);
+
+ else
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if, tbl, NULL);
+ }
+#endif
+ else {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+
+ rcu_read_unlock();
+
+ if (!sk)
+ return -ENOENT;
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_put(sk);
+ return -ENOENT;
+ }
+
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_put(sk);
+
+ return err;
+}
+
+static int udp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, &udp_table);
+}
+
+static int udplite_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, &udplite_table);
+}
+
+#endif
+
static const struct inet_diag_handler udp_diag_handler = {
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
.idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udp_diag_destroy,
+#endif
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -192,6 +270,9 @@ static const struct inet_diag_handler udplite_diag_handler = {
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
.idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udplite_diag_destroy,
+#endif
};
static int __init udp_diag_init(void)
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 7e0fe4bdd967..feb50a16398d 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
-int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b6ff36..b2be1d9757ef 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo;
+ bool remcsum, need_csum, offload_csum, ufo, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);
@@ -293,7 +295,7 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- pp = udp_sk(sk)->gro_receive(sk, head, skb);
+ pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2eea073e27ef..59f10fe9782e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -50,17 +50,17 @@ struct proto udplite_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
- .clear_sk = sk_prot_clear_portaddr_nulls,
};
EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 41f5b504a782..6a7ff6957535 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
int oif = 0;
if (skb_dst(skb))
- oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f2e0bf..ec1267e2bd1f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -289,4 +289,39 @@ config IPV6_PIMSM_V2
Support for IPv6 PIM multicast routing protocol PIM-SMv2.
If unsure, say N.
+config IPV6_SEG6_LWTUNNEL
+ bool "IPv6: Segment Routing Header encapsulation support"
+ depends on IPV6
+ select LWTUNNEL
+ ---help---
+ Support for encapsulation of packets within an outer IPv6
+ header and a Segment Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
+config IPV6_SEG6_INLINE
+ bool "IPv6: direct Segment Routing Header insertion "
+ depends on IPV6_SEG6_LWTUNNEL
+ ---help---
+ Support for direct insertion of the Segment Routing Header,
+ also known as inline mode. Be aware that direct insertion of
+ extension headers (as opposed to encapsulation) may break
+ multiple mechanisms such as PMTUD or IPSec AH. Use this feature
+ only if you know exactly what you are doing.
+
+ If unsure, say N.
+
+config IPV6_SEG6_HMAC
+ bool "IPv6: Segment Routing HMAC support"
+ depends on IPV6
+ select CRYPTO_HMAC
+ select CRYPTO_SHA1
+ select CRYPTO_SHA256
+ ---help---
+ Support for HMAC signature generation and verification
+ of SR-enabled packets.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c174ccb340a1..a9e9fec387ce 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o
+ udp_offload.o seg6.o
ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
@@ -23,6 +23,8 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
ipv6-$(CONFIG_PROC_FS) += proc.o
ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
+ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2f1f5d439788..a7bcc0ab5e99 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp)
return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
}
+static inline s32 rfc3315_s14_backoff_init(s32 irt)
+{
+ /* multiply 'initial retransmission time' by 0.9 .. 1.1 */
+ u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ do_div(tmp, 1000000);
+ return (s32)tmp;
+}
+
+static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
+{
+ /* multiply 'retransmission timeout' by 1.9 .. 2.1 */
+ u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ do_div(tmp, 1000000);
+ if ((s32)tmp > mrt) {
+ /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
+ tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ do_div(tmp, 1000000);
+ }
+ return (s32)tmp;
+}
+
#ifdef CONFIG_SYSCTL
static int addrconf_sysctl_register(struct inet6_dev *idev);
static void addrconf_sysctl_unregister(struct inet6_dev *idev);
@@ -126,9 +147,8 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
}
#endif
-static void __ipv6_regen_rndid(struct inet6_dev *idev);
-static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
-static void ipv6_regen_rndid(unsigned long data);
+static void ipv6_regen_rndid(struct inet6_dev *idev);
+static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
static int ipv6_count_addresses(struct inet6_dev *idev);
@@ -163,7 +183,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
static void addrconf_dad_start(struct inet6_ifaddr *ifp);
static void addrconf_dad_work(struct work_struct *w);
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
static void addrconf_dad_run(struct inet6_dev *idev);
static void addrconf_rs_timer(unsigned long data);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -187,6 +207,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -217,6 +238,11 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
.keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
+#endif
+ .enhanced_dad = 1,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -232,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -262,6 +289,11 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
.keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
+#endif
+ .enhanced_dad = 1,
};
/* Check if a valid qdisc is available */
@@ -386,9 +418,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
goto err_release;
}
- /* One reference from device. We must do this before
- * we invoke __ipv6_regen_rndid().
- */
+ /* One reference from device. */
in6_dev_hold(ndev);
if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -402,17 +432,15 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
#endif
INIT_LIST_HEAD(&ndev->tempaddr_list);
- setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
+ ndev->desync_factor = U32_MAX;
if ((dev->flags&IFF_LOOPBACK) ||
dev->type == ARPHRD_TUNNEL ||
dev->type == ARPHRD_TUNNEL6 ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
ndev->cnf.use_tempaddr = -1;
- } else {
- in6_dev_hold(ndev);
- ipv6_regen_rndid((unsigned long) ndev);
- }
+ } else
+ ipv6_regen_rndid(ndev);
ndev->token = in6addr_any;
@@ -424,7 +452,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
err = addrconf_sysctl_register(ndev);
if (err) {
ipv6_mc_destroy_dev(ndev);
- del_timer(&ndev->regen_timer);
snmp6_unregister_dev(ndev);
goto err_release;
}
@@ -1167,6 +1194,8 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
int ret = 0;
u32 addr_flags;
unsigned long now = jiffies;
+ long max_desync_factor;
+ s32 cnf_temp_preferred_lft;
write_lock_bh(&idev->lock);
if (ift) {
@@ -1199,23 +1228,42 @@ retry:
}
in6_ifa_hold(ifp);
memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
- __ipv6_try_regen_rndid(idev, tmpaddr);
+ ipv6_try_regen_rndid(idev, tmpaddr);
memcpy(&addr.s6_addr[8], idev->rndid, 8);
age = (now - ifp->tstamp) / HZ;
+
+ regen_advance = idev->cnf.regen_max_retry *
+ idev->cnf.dad_transmits *
+ NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+
+ /* recalculate max_desync_factor each time and update
+ * idev->desync_factor if it's larger
+ */
+ cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
+ max_desync_factor = min_t(__u32,
+ idev->cnf.max_desync_factor,
+ cnf_temp_preferred_lft - regen_advance);
+
+ if (unlikely(idev->desync_factor > max_desync_factor)) {
+ if (max_desync_factor > 0) {
+ get_random_bytes(&idev->desync_factor,
+ sizeof(idev->desync_factor));
+ idev->desync_factor %= max_desync_factor;
+ } else {
+ idev->desync_factor = 0;
+ }
+ }
+
tmp_valid_lft = min_t(__u32,
ifp->valid_lft,
idev->cnf.temp_valid_lft + age);
- tmp_prefered_lft = min_t(__u32,
- ifp->prefered_lft,
- idev->cnf.temp_prefered_lft + age -
- idev->cnf.max_desync_factor);
+ tmp_prefered_lft = cnf_temp_preferred_lft + age -
+ idev->desync_factor;
+ tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
tmp_plen = ifp->prefix_len;
tmp_tstamp = ifp->tstamp;
spin_unlock_bh(&ifp->lock);
- regen_advance = idev->cnf.regen_max_retry *
- idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
write_unlock_bh(&idev->lock);
/* A temporary address is created only if this calculated Preferred
@@ -2127,7 +2175,7 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
}
/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
-static void __ipv6_regen_rndid(struct inet6_dev *idev)
+static void ipv6_regen_rndid(struct inet6_dev *idev)
{
regen:
get_random_bytes(idev->rndid, sizeof(idev->rndid));
@@ -2156,43 +2204,10 @@ regen:
}
}
-static void ipv6_regen_rndid(unsigned long data)
-{
- struct inet6_dev *idev = (struct inet6_dev *) data;
- unsigned long expires;
-
- rcu_read_lock_bh();
- write_lock_bh(&idev->lock);
-
- if (idev->dead)
- goto out;
-
- __ipv6_regen_rndid(idev);
-
- expires = jiffies +
- idev->cnf.temp_prefered_lft * HZ -
- idev->cnf.regen_max_retry * idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) -
- idev->cnf.max_desync_factor * HZ;
- if (time_before(expires, jiffies)) {
- pr_warn("%s: too short regeneration interval; timer disabled for %s\n",
- __func__, idev->dev->name);
- goto out;
- }
-
- if (!mod_timer(&idev->regen_timer, expires))
- in6_dev_hold(idev);
-
-out:
- write_unlock_bh(&idev->lock);
- rcu_read_unlock_bh();
- in6_dev_put(idev);
-}
-
-static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
+static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
{
if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
- __ipv6_regen_rndid(idev);
+ ipv6_regen_rndid(idev);
}
/*
@@ -2333,7 +2348,7 @@ static void manage_tempaddrs(struct inet6_dev *idev,
max_valid = 0;
max_prefered = idev->cnf.temp_prefered_lft -
- idev->cnf.max_desync_factor - age;
+ idev->desync_factor - age;
if (max_prefered < 0)
max_prefered = 0;
@@ -2893,6 +2908,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
spin_lock_bh(&ifp->lock);
ifp->flags &= ~IFA_F_TENTATIVE;
spin_unlock_bh(&ifp->lock);
+ rt_genid_bump_ipv6(dev_net(idev->dev));
ipv6_ifa_notify(RTM_NEWADDR, ifp);
in6_ifa_put(ifp);
}
@@ -2995,7 +3011,7 @@ static void init_loopback(struct net_device *dev)
* lo device down, release this obsolete dst and
* reallocate a new router for ifa.
*/
- if (sp_ifa->rt->dst.obsolete > 0) {
+ if (!atomic_read(&sp_ifa->rt->rt6i_ref)) {
ip6_rt_put(sp_ifa->rt);
sp_ifa->rt = NULL;
} else {
@@ -3370,9 +3386,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
}
if (idev) {
- if (idev->if_flags & IF_READY)
- /* device is already configured. */
+ if (idev->if_flags & IF_READY) {
+ /* device is already configured -
+ * but resend MLD reports, we might
+ * have roamed and need to update
+ * multicast snooping switches
+ */
+ ipv6_mc_up(idev);
break;
+ }
idev->if_flags |= IF_READY;
}
@@ -3571,9 +3593,6 @@ restart:
if (!how)
idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
- if (how && del_timer(&idev->regen_timer))
- in6_dev_put(idev);
-
/* Step 3: clear tempaddr list */
while (!list_empty(&idev->tempaddr_list)) {
ifa = list_first_entry(&idev->tempaddr_list,
@@ -3687,7 +3706,7 @@ static void addrconf_rs_timer(unsigned long data)
if (idev->if_flags & IF_RA_RCVD)
goto out;
- if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
+ if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
write_unlock(&idev->lock);
if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
ndisc_send_rs(dev, &lladdr,
@@ -3696,11 +3715,13 @@ static void addrconf_rs_timer(unsigned long data)
goto put;
write_lock(&idev->lock);
+ idev->rs_interval = rfc3315_s14_backoff_update(
+ idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
/* The wait after the last probe can be shorter */
addrconf_mod_rs_timer(idev, (idev->rs_probes ==
idev->cnf.rtr_solicits) ?
idev->cnf.rtr_solicit_delay :
- idev->cnf.rtr_solicit_interval);
+ idev->rs_interval);
} else {
/*
* Note: we do not support deprecated "all on-link"
@@ -3722,12 +3743,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
unsigned long rand_num;
struct inet6_dev *idev = ifp->idev;
+ u64 nonce;
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ nonce = 0;
+ if (idev->cnf.enhanced_dad ||
+ dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
+ do
+ get_random_bytes(&nonce, 6);
+ while (nonce == 0);
+ }
+ ifp->dad_nonce = nonce;
ifp->dad_probes = idev->cnf.dad_transmits;
addrconf_mod_dad_work(ifp, rand_num);
}
@@ -3736,7 +3766,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
- bool notify = false;
+ bool bump_id, notify = false;
addrconf_join_solict(dev, &ifp->addr);
@@ -3751,11 +3781,12 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
idev->cnf.accept_dad < 1 ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id);
return;
}
@@ -3815,8 +3846,8 @@ static void addrconf_dad_work(struct work_struct *w)
struct inet6_ifaddr,
dad_work);
struct inet6_dev *idev = ifp->idev;
+ bool bump_id, disable_ipv6 = false;
struct in6_addr mcaddr;
- bool disable_ipv6 = false;
enum {
DAD_PROCESS,
@@ -3886,11 +3917,12 @@ static void addrconf_dad_work(struct work_struct *w)
* DAD was successful
*/
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id);
goto out;
}
@@ -3903,7 +3935,8 @@ static void addrconf_dad_work(struct work_struct *w)
/* send a neighbour solicitation for our addr */
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
+ ifp->dad_nonce);
out:
in6_ifa_put(ifp);
rtnl_unlock();
@@ -3927,7 +3960,7 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
return true;
}
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
{
struct net_device *dev = ifp->idev->dev;
struct in6_addr lladdr;
@@ -3949,7 +3982,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
send_rs = send_mld &&
ipv6_accept_ra(ifp->idev) &&
- ifp->idev->cnf.rtr_solicits > 0 &&
+ ifp->idev->cnf.rtr_solicits != 0 &&
(dev->flags&IFF_LOOPBACK) == 0;
read_unlock_bh(&ifp->idev->lock);
@@ -3971,13 +4004,23 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
write_lock_bh(&ifp->idev->lock);
spin_lock(&ifp->lock);
+ ifp->idev->rs_interval = rfc3315_s14_backoff_init(
+ ifp->idev->cnf.rtr_solicit_interval);
ifp->idev->rs_probes = 1;
ifp->idev->if_flags |= IF_RS_SENT;
- addrconf_mod_rs_timer(ifp->idev,
- ifp->idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
spin_unlock(&ifp->lock);
write_unlock_bh(&ifp->idev->lock);
}
+
+ if (bump_id)
+ rt_genid_bump_ipv6(dev_net(dev));
+
+ /* Make sure that a new temporary address will be created
+ * before this temporary address becomes deprecated.
+ */
+ if (ifp->flags & IFA_F_TEMPORARY)
+ addrconf_verify_rtnl();
}
static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4891,6 +4934,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
array[DEVCONF_RTR_SOLICIT_INTERVAL] =
jiffies_to_msecs(cnf->rtr_solicit_interval);
+ array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
+ jiffies_to_msecs(cnf->rtr_solicit_max_interval);
array[DEVCONF_RTR_SOLICIT_DELAY] =
jiffies_to_msecs(cnf->rtr_solicit_delay);
array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
@@ -4937,6 +4982,11 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
+ array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
+#endif
+ array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
}
static inline size_t inet6_ifla6_size(void)
@@ -4961,18 +5011,18 @@ static inline size_t inet6_if_nlmsg_size(void)
}
static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
- int items, int bytes)
+ int bytes)
{
int i;
- int pad = bytes - sizeof(u64) * items;
+ int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX;
BUG_ON(pad < 0);
/* Use put_unaligned() because stats may not be aligned for u64. */
- put_unaligned(items, &stats[0]);
- for (i = 1; i < items; i++)
+ put_unaligned(ICMP6_MIB_MAX, &stats[0]);
+ for (i = 1; i < ICMP6_MIB_MAX; i++)
put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
- memset(&stats[items], 0, pad);
+ memset(&stats[ICMP6_MIB_MAX], 0, pad);
}
static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
@@ -5005,7 +5055,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
offsetof(struct ipstats_mib, syncp));
break;
case IFLA_INET6_ICMP6STATS:
- __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
+ __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes);
break;
}
}
@@ -5099,7 +5149,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
return -EINVAL;
if (!ipv6_accept_ra(idev))
return -EINVAL;
- if (idev->cnf.rtr_solicits <= 0)
+ if (idev->cnf.rtr_solicits == 0)
return -EINVAL;
write_lock_bh(&idev->lock);
@@ -5128,8 +5178,10 @@ update_lft:
if (update_rs) {
idev->if_flags |= IF_RS_SENT;
+ idev->rs_interval = rfc3315_s14_backoff_init(
+ idev->cnf.rtr_solicit_interval);
idev->rs_probes = 1;
- addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(idev, idev->rs_interval);
}
/* Well, that's kinda nasty ... */
@@ -5467,20 +5519,6 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
}
static
-int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table lctl;
- int min_hl = 1, max_hl = 255;
-
- lctl = *ctl;
- lctl.extra1 = &min_hl;
- lctl.extra2 = &max_hl;
-
- return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
-}
-
-static
int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -5514,8 +5552,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
struct net_device *dev;
struct inet6_dev *idev;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
idev = __in6_dev_get(dev);
if (idev) {
int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
@@ -5524,7 +5561,6 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
dev_disable_change(idev);
}
}
- rcu_read_unlock();
}
static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
@@ -5713,6 +5749,10 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
return ret;
}
+static int minus_one = -1;
+static const int one = 1;
+static const int two_five_five = 255;
+
static const struct ctl_table addrconf_sysctl[] = {
{
.procname = "forwarding",
@@ -5726,7 +5766,9 @@ static const struct ctl_table addrconf_sysctl[] = {
.data = &ipv6_devconf.hop_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = addrconf_sysctl_hop_limit,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ .extra2 = (void *)&two_five_five,
},
{
.procname = "mtu",
@@ -5768,7 +5810,8 @@ static const struct ctl_table addrconf_sysctl[] = {
.data = &ipv6_devconf.rtr_solicits,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minus_one,
},
{
.procname = "router_solicitation_interval",
@@ -5778,6 +5821,13 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
+ .procname = "router_solicitation_max_interval",
+ .data = &ipv6_devconf.rtr_solicit_max_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
.procname = "router_solicitation_delay",
.data = &ipv6_devconf.rtr_solicit_delay,
.maxlen = sizeof(int),
@@ -6027,6 +6077,29 @@ static const struct ctl_table addrconf_sysctl[] = {
},
{
+ .procname = "seg6_enabled",
+ .data = &ipv6_devconf.seg6_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ {
+ .procname = "seg6_require_hmac",
+ .data = &ipv6_devconf.seg6_require_hmac,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "enhanced_dad",
+ .data = &ipv6_devconf.enhanced_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
/* sentinel */
}
};
@@ -6044,8 +6117,14 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
for (i = 0; table[i].data; i++) {
table[i].data += (char *)p - (char *)&ipv6_devconf;
- table[i].extra1 = idev; /* embedded; no ref */
- table[i].extra2 = net;
+ /* If one of these is already set, then it is not safe to
+ * overwrite either of them: this makes proc_dointvec_minmax
+ * usable.
+ */
+ if (!table[i].extra1 && !table[i].extra2) {
+ table[i].extra1 = idev; /* embedded; no ref */
+ table[i].extra2 = net;
+ }
}
snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b454055ba625..aa42123bc301 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -61,8 +61,9 @@
#include <net/ip6_tunnel.h>
#endif
#include <net/calipso.h>
+#include <net/seg6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mroute6.h>
#include "ip6_offload.h"
@@ -257,6 +258,14 @@ lookup_protocol:
goto out;
}
}
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
out:
return err;
out_rcu_unlock:
@@ -545,6 +554,8 @@ const struct proto_ops inet6_stream_ops = {
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
+ .read_sock = tcp_read_sock,
+ .peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -676,6 +687,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
rcu_read_lock();
@@ -988,6 +1000,10 @@ static int __init inet6_init(void)
if (err)
goto calipso_fail;
+ err = seg6_init();
+ if (err)
+ goto seg6_fail;
+
#ifdef CONFIG_SYSCTL
err = ipv6_sysctl_register();
if (err)
@@ -998,8 +1014,10 @@ out:
#ifdef CONFIG_SYSCTL
sysctl_fail:
- calipso_exit();
+ seg6_exit();
#endif
+seg6_fail:
+ calipso_exit();
calipso_fail:
pingv6_exit();
pingv6_fail:
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 0630a4d5daaa..189eb10b742d 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -662,9 +662,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 37874e2f30ed..eec27f87efac 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -33,7 +33,7 @@
#include <net/dsfield.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static bool ipv6_mapped_addr_any(const struct in6_addr *a)
{
@@ -54,6 +54,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
fl6->flowlabel = np->flow_label;
+ fl6->flowi6_uid = sk->sk_uid;
if (!fl6->flowi6_oif)
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
@@ -139,7 +140,8 @@ void ip6_datagram_release_cb(struct sock *sk)
}
EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
-static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -165,18 +167,22 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
if (np->sndflow)
fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if (addr_type == IPV6_ADDR_ANY) {
+ if (ipv6_addr_any(&usin->sin6_addr)) {
/*
* connect to self
*/
- usin->sin6_addr.s6_addr[15] = 0x01;
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
}
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
daddr = &usin->sin6_addr;
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
if (__ipv6_only_sock(sk)) {
@@ -252,6 +258,7 @@ ipv4_connected:
out:
return err;
}
+EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -698,7 +705,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
struct sockaddr_in6 sin6;
__be16 *ports = (__be16 *) skb_transport_header(skb);
- if (skb_transport_offset(skb) + 4 <= skb->len) {
+ if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
@@ -715,6 +722,11 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
}
}
+ if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
+ int val = opt->frag_max_size;
+
+ put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
+ }
}
void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 060a60b2f8a6..cbcdd5db31f4 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -418,7 +418,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
esph = (void *)skb_push(skb, 4);
*seqhi = esph->spi;
esph->spi = esph->seq_no;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
}
@@ -474,9 +474,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 139ceb68bd37..275cac628a95 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -47,6 +47,11 @@
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
+#include <linux/seg6.h>
+#include <net/seg6.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
#include <linux/uaccess.h>
@@ -227,7 +232,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
ipv6h->saddr = hao->addr;
hao->addr = tmp_addr;
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp(skb);
return true;
@@ -286,6 +291,157 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
return -1;
}
+static void seg6_update_csum(struct sk_buff *skb)
+{
+ struct ipv6_sr_hdr *hdr;
+ struct in6_addr *addr;
+ __be32 from, to;
+
+ /* srh is at transport offset and seg_left is already decremented
+ * but daddr is not yet updated with next segment
+ */
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ addr = hdr->segments + hdr->segments_left;
+
+ hdr->segments_left++;
+ from = *(__be32 *)hdr;
+
+ hdr->segments_left--;
+ to = *(__be32 *)hdr;
+
+ /* update skb csum with diff resulting from seg_left decrement */
+
+ update_csum_diff4(skb, from, to);
+
+ /* compute csum diff between current and next segment and update */
+
+ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
+ (__be32 *)addr);
+}
+
+static int ipv6_srh_rcv(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ipv6_sr_hdr *hdr;
+ struct inet6_dev *idev;
+ struct in6_addr *addr;
+ int accept_seg6;
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_seg6 = net->ipv6.devconf_all->seg6_enabled;
+ if (accept_seg6 > idev->cnf.seg6_enabled)
+ accept_seg6 = idev->cnf.seg6_enabled;
+
+ if (!accept_seg6) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (!seg6_hmac_validate_skb(skb)) {
+ kfree_skb(skb);
+ return -1;
+ }
+#endif
+
+looped_back:
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+
+ if (!pskb_pull(skb, offset)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ hdr->segments_left--;
+ addr = hdr->segments + hdr->segments_left;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ seg6_update_csum(skb);
+
+ ipv6_hdr(skb)->daddr = *addr;
+
+ skb_dst_drop(skb);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
/********************************
Routing header.
********************************/
@@ -326,6 +482,10 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
+ /* segment routing */
+ if (hdr->type == IPV6_SRCRT_TYPE_4)
+ return ipv6_srh_rcv(skb);
+
looped_back:
if (hdr->segments_left == 0) {
switch (hdr->type) {
@@ -679,9 +839,9 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
* for headers.
*/
-static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p)
+static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
struct rt0_hdr *phdr, *ihdr;
int hops;
@@ -704,6 +864,62 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
*proto = NEXTHDR_ROUTING;
}
+static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ struct ipv6_sr_hdr *sr_phdr, *sr_ihdr;
+ int plen, hops;
+
+ sr_ihdr = (struct ipv6_sr_hdr *)opt;
+ plen = (sr_ihdr->hdrlen + 1) << 3;
+
+ sr_phdr = (struct ipv6_sr_hdr *)skb_push(skb, plen);
+ memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr));
+
+ hops = sr_ihdr->first_segment + 1;
+ memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1,
+ (hops - 1) * sizeof(struct in6_addr));
+
+ sr_phdr->segments[0] = **addr_p;
+ *addr_p = &sr_ihdr->segments[hops - 1];
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(sr_phdr)) {
+ struct net *net = NULL;
+
+ if (skb->dev)
+ net = dev_net(skb->dev);
+ else if (skb->sk)
+ net = sock_net(skb->sk);
+
+ WARN_ON(!net);
+
+ if (net)
+ seg6_push_hmac(net, saddr, sr_phdr);
+ }
+#endif
+
+ sr_phdr->nexthdr = *proto;
+ *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ switch (opt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
+ break;
+ default:
+ break;
+ }
+}
+
static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
{
struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
@@ -715,10 +931,10 @@ static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
u8 *proto,
- struct in6_addr **daddr)
+ struct in6_addr **daddr, struct in6_addr *saddr)
{
if (opt->srcrt) {
- ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+ ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
/*
* IPV6_RTHDRDSTOPTS is ignored
* unless IPV6_RTHDR is set (RFC3542).
@@ -945,7 +1161,22 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
return NULL;
*orig = fl6->daddr;
- fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+
+ switch (opt->srcrt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;
+
+ fl6->daddr = srh->segments[srh->first_segment];
+ break;
+ }
+ default:
+ return NULL;
+ }
+
return orig;
}
EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 5857c1fc8b67..eea23b57c6a5 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index bd59c343d35f..3036f665e6c8 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -70,7 +70,7 @@
#include <net/dsfield.h>
#include <net/l3mdev.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* The ICMP socket(s). This is the most convenient way to flow control
@@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
@@ -447,8 +448,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type))
iif = skb->dev->ifindex;
- else
- iif = l3mdev_master_ifindex(skb->dev);
+ else {
+ dst = skb_dst(skb);
+ iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+ }
/*
* Must not send error if the source does not uniquely
@@ -484,6 +487,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
@@ -658,6 +662,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index ec9efbcdad35..aba0998ddbfb 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -172,6 +172,5 @@ static void __exit ila_fini(void)
module_init(ila_init);
module_exit(ila_fini);
-MODULE_ALIAS_RTNL_LWT(ILA);
MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index c8314c6b6154..13b5e85fe0d5 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -6,29 +6,88 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <net/checksum.h>
+#include <net/dst_cache.h>
#include <net/ip.h>
#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
#include <net/lwtunnel.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
#include "ila.h"
+struct ila_lwt {
+ struct ila_params p;
+ struct dst_cache dst_cache;
+ u32 connected : 1;
+};
+
+static inline struct ila_lwt *ila_lwt_lwtunnel(
+ struct lwtunnel_state *lwt)
+{
+ return (struct ila_lwt *)lwt->data;
+}
+
static inline struct ila_params *ila_params_lwtunnel(
- struct lwtunnel_state *lwstate)
+ struct lwtunnel_state *lwt)
{
- return (struct ila_params *)lwstate->data;
+ return &ila_lwt_lwtunnel(lwt)->p;
}
static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct rt6_info *rt = (struct rt6_info *)orig_dst;
+ struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
+ struct dst_entry *dst;
+ int err = -EINVAL;
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true);
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
+ true);
- return dst->lwtstate->orig_output(net, sk, skb);
+ if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
+ /* Already have a next hop address in route, no need for
+ * dest cache route.
+ */
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
+ }
+
+ dst = dst_cache_get(&ilwt->dst_cache);
+ if (unlikely(!dst)) {
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ /* Lookup a route for the new destination. Take into
+ * account that the base route may already have a gateway.
+ */
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = orig_dst->dev->ifindex;
+ fl6.flowi6_iif = LOOPBACK_IFINDEX;
+ fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
+ &ip6h->daddr);
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = -EHOSTUNREACH;
+ dst_release(dst);
+ goto drop;
+ }
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto drop;
+ }
+
+ if (ilwt->connected)
+ dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr);
+ }
+
+ skb_dst_set(skb, dst);
+ return dst_output(net, sk, skb);
drop:
kfree_skb(skb);
@@ -51,7 +110,7 @@ drop:
return -EINVAL;
}
-static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
};
@@ -60,9 +119,9 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
+ struct ila_lwt *ilwt;
struct ila_params *p;
struct nlattr *tb[ILA_ATTR_MAX + 1];
- size_t encap_len = sizeof(*p);
struct lwtunnel_state *newts;
const struct fib6_config *cfg6 = cfg;
struct ila_addr *iaddr;
@@ -71,7 +130,7 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
if (family != AF_INET6)
return -EINVAL;
- if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) {
+ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
/* Need to have full locator and at least type field
* included in destination
*/
@@ -95,11 +154,17 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
if (!tb[ILA_ATTR_LOCATOR])
return -EINVAL;
- newts = lwtunnel_state_alloc(encap_len);
+ newts = lwtunnel_state_alloc(sizeof(*ilwt));
if (!newts)
return -ENOMEM;
- newts->len = encap_len;
+ ilwt = ila_lwt_lwtunnel(newts);
+ ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
+ if (ret) {
+ kfree(newts);
+ return ret;
+ }
+
p = ila_params_lwtunnel(newts);
p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
@@ -120,11 +185,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
LWTUNNEL_STATE_INPUT_REDIRECT;
+ if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
+ ilwt->connected = 1;
+
*ts = newts;
return 0;
}
+static void ila_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache);
+}
+
static int ila_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -159,11 +232,13 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
static const struct lwtunnel_encap_ops ila_encap_ops = {
.build_state = ila_build_state,
+ .destroy_state = ila_destroy_state,
.output = ila_output,
.input = ila_input,
.fill_encap = ila_fill_encap_info,
.get_encap_size = ila_encap_nlsize,
.cmp_encap = ila_encap_cmp,
+ .owner = THIS_MODULE,
};
int ila_lwt_init(void)
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e6eca5fdf4c9..af8f52ee7180 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -118,17 +118,9 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = ila_cmpfn,
};
-static struct genl_family ila_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = ILA_GENL_NAME,
- .version = ILA_GENL_VERSION,
- .maxattr = ILA_ATTR_MAX,
- .netnsok = true,
- .parallel_ops = true,
-};
+static struct genl_family ila_nl_family;
-static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
@@ -482,7 +474,15 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
{
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
GFP_KERNEL);
@@ -490,16 +490,18 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
static int ila_nl_dump_done(struct netlink_callback *cb)
{
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
rhashtable_walk_exit(&iter->rhiter);
+ kfree(iter);
+
return 0;
}
static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
struct rhashtable_iter *rhiter = &iter->rhiter;
struct ila_map *ila;
int ret;
@@ -561,6 +563,18 @@ static const struct genl_ops ila_nl_ops[] = {
},
};
+static struct genl_family ila_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .ops = ila_nl_ops,
+ .n_ops = ARRAY_SIZE(ila_nl_ops),
+};
+
#define ILA_HASH_TABLE_SIZE 1024
static __net_init int ila_init_net(struct net *net)
@@ -623,7 +637,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
return 0;
}
-int ila_xlat_init(void)
+int __init ila_xlat_init(void)
{
int ret;
@@ -631,8 +645,7 @@ int ila_xlat_init(void)
if (ret)
goto exit;
- ret = genl_register_family_with_ops(&ila_nl_family,
- ila_nl_ops);
+ ret = genl_register_family(&ila_nl_family);
if (ret < 0)
goto unregister;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 532c3ef282c5..75c308239243 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -29,11 +29,12 @@
#include <net/sock_reuseport.h>
int inet6_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax)
+ const struct inet_bind_bucket *tb, bool relax,
+ bool reuseport_ok)
{
const struct sock *sk2;
- int reuse = sk->sk_reuse;
- int reuseport = sk->sk_reuseport;
+ bool reuse = !!sk->sk_reuse;
+ bool reuseport = !!sk->sk_reuseport && reuseport_ok;
kuid_t uid = sock_i_uid((struct sock *)sk);
/* We must walk the whole port owner list in this case. -DaveM */
@@ -88,6 +89,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -136,6 +138,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
rcu_read_lock();
@@ -173,7 +176,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
/* Restore final destination back after routing done */
fl6.daddr = sk->sk_v6_daddr;
- res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
np->tclass);
rcu_read_unlock();
return res;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 00cf28ad4565..02761c9fe43e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum,
const struct in6_addr *daddr,
- const int dif)
+ const int dif, bool exact_dif)
{
int score = -1;
@@ -109,7 +109,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score++;
@@ -131,11 +131,12 @@ struct sock *inet6_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore = 0, matches = 0, reuseport = 0;
+ bool exact_dif = inet6_exact_dif_match(net, skb);
struct sock *sk, *result = NULL;
u32 phash = 0;
sk_for_each(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif);
+ score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
if (score > hiscore) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -263,13 +264,15 @@ EXPORT_SYMBOL_GPL(inet6_hash_connect);
int inet6_hash(struct sock *sk)
{
+ int err = 0;
+
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+ err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
local_bh_enable();
}
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(inet6_hash);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 771be1fa4176..ef5485204522 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ u16 nlflags = NLM_F_EXCL;
int err;
ins = &fn->leaf;
@@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
if (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_EXCL))
return -EEXIST;
+
+ nlflags &= ~NLM_F_EXCL;
if (replace) {
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
found++;
@@ -856,6 +859,7 @@ next_iter:
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
+ nlflags |= NLM_F_CREATE;
err = fib6_commit_metrics(&rt->dst, mxc);
if (err)
return err;
@@ -864,7 +868,7 @@ add:
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
if (!(fn->fn_flags & RTN_RTINFO)) {
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b912f0dbaf72..8081bafe441b 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -29,7 +29,7 @@
#include <net/rawv6.h>
#include <net/transp_v6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified
in old IPv6 RFC. Well, it was reasonable value.
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index edc3daab354e..630b73be5999 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -61,12 +61,12 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-#define HASH_SIZE_SHIFT 5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_GRE_HASH_SIZE_SHIFT 5
+#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
-static int ip6gre_net_id __read_mostly;
+static unsigned int ip6gre_net_id __read_mostly;
struct ip6gre_net {
- struct ip6_tnl __rcu *tunnels[4][HASH_SIZE];
+ struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
struct net_device *fb_tunnel_dev;
};
@@ -96,12 +96,12 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
will match fallback tunnel.
*/
-#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1))
+#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1))
static u32 HASH_ADDR(const struct in6_addr *addr)
{
u32 hash = ipv6_addr_hash(addr);
- return hash_32(hash, HASH_SIZE_SHIFT);
+ return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT);
}
#define tunnels_r_l tunnels[3]
@@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info)
+ u8 type, u8 code, int offset, __be32 info)
{
- const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
- __be16 *p = (__be16 *)(skb->data + offset);
- int grehlen = offset + 4;
+ const struct gre_base_hdr *greh;
+ const struct ipv6hdr *ipv6h;
+ int grehlen = sizeof(*greh);
struct ip6_tnl *t;
+ int key_off = 0;
__be16 flags;
+ __be32 key;
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
+ if (!pskb_may_pull(skb, offset + grehlen))
+ return;
+ greh = (const struct gre_base_hdr *)(skb->data + offset);
+ flags = greh->flags;
+ if (flags & (GRE_VERSION | GRE_ROUTING))
+ return;
+ if (flags & GRE_CSUM)
+ grehlen += 4;
+ if (flags & GRE_KEY) {
+ key_off = grehlen + offset;
+ grehlen += 4;
}
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (!pskb_may_pull(skb, grehlen))
+ if (!pskb_may_pull(skb, offset + grehlen))
return;
ipv6h = (const struct ipv6hdr *)skb->data;
- p = (__be16 *)(skb->data + offset);
+ greh = (const struct gre_base_hdr *)(skb->data + offset);
+ key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- flags & GRE_KEY ?
- *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
- p[1]);
+ key, greh->protocol);
if (!t)
return;
@@ -548,6 +550,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
return -1;
@@ -580,6 +584,9 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
return -1;
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
+
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
@@ -602,6 +609,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
return -1;
@@ -1086,7 +1095,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
for (prio = 0; prio < 4; prio++) {
int h;
- for (h = 0; h < HASH_SIZE; h++) {
+ for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
struct ip6_tnl *t;
t = rtnl_dereference(ign->tunnels[prio][h]);
@@ -1238,7 +1247,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
if (data[IFLA_GRE_FLOWINFO])
- parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]);
+ parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
if (data[IFLA_GRE_FLAGS])
parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e56b5a9..fc7b4017ba24 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
int offset = 0;
bool encap, udpfrag;
int nhoff;
+ bool gso_partial;
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -98,12 +99,14 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
segs = ops->callbacks.gso_segment(skb, features);
}
- if (IS_ERR(segs))
+ if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
- if (skb_is_gso(skb))
+ if (gso_partial)
payload_len = skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)(ipv6h + 1);
@@ -188,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
__pskb_pull(skb, skb_gro_offset(skb));
+ skb_gro_frag0_invalidate(skb);
proto = ipv6_gso_pull_exthdrs(skb, proto);
skb_gro_pull(skb, -skb_transport_offset(skb));
skb_reset_transport_header(skb);
@@ -243,7 +247,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
skb_gro_postpull_rcsum(skb, iph, nlen);
- pp = ops->callbacks.gro_receive(head, skb);
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1dfc402d9ad1..7cebee58e55b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -56,6 +57,7 @@
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -104,6 +106,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
}
+ if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+ int res = lwtunnel_xmit(skb);
+
+ if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ return res;
+ }
+
rcu_read_lock_bh();
nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
@@ -123,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
@@ -155,7 +172,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
* which are using proper atomic operations or spinlocks.
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- struct ipv6_txoptions *opt, int tclass)
+ __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -195,7 +212,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
+ &fl6->saddr);
}
skb_push(skb, sizeof(struct ipv6hdr));
@@ -222,12 +240,20 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
skb->protocol = htons(ETH_P_IPV6);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = mark;
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUT, skb->len);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out((struct sock *)sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
/* hooks should never assume socket lock is held.
* we promote our socket to non const
*/
@@ -608,7 +634,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
- int first_len = skb_pagelen(skb);
+ unsigned int first_len = skb_pagelen(skb);
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -910,13 +936,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
int err;
int flags = 0;
- if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
- (!*dst || !(*dst)->error)) {
- err = l3mdev_get_saddr6(net, sk, fl6);
- if (err)
- goto out_err;
- }
-
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
* the route-specific preferred source forces the
@@ -1002,13 +1021,18 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
}
}
#endif
+ if (ipv6_addr_v4mapped(&fl6->saddr) &&
+ !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
+ err = -EAFNOSUPPORT;
+ goto out_err_release;
+ }
return 0;
out_err_release:
dst_release(*dst);
*dst = NULL;
-out_err:
+
if (err == -ENETUNREACH)
IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
return err;
@@ -1054,8 +1078,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
- if (!fl6->flowi6_oif)
- fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
@@ -1327,7 +1349,7 @@ emsgsize:
*/
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
- length < mtu - headersize &&
+ length <= mtu - headersize &&
!(flags & MSG_MORE) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
@@ -1356,10 +1378,10 @@ emsgsize:
*/
cork->length += length;
- if (((length > mtu) ||
+ if ((((length + fragheaderlen) > mtu) ||
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
(sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, exthdrlen,
@@ -1665,7 +1687,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 888543debe4e..75fac933c209 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -42,7 +42,7 @@
#include <linux/hash.h>
#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <net/icmp.h>
@@ -57,6 +57,7 @@
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -64,8 +65,8 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("ip6tnl");
MODULE_ALIAS_NETDEV("ip6tnl0");
-#define HASH_SIZE_SHIFT 5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_TUNNEL_HASH_SIZE_SHIFT 5
+#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
@@ -75,21 +76,22 @@ static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
{
u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
- return hash_32(hash, HASH_SIZE_SHIFT);
+ return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT);
}
static int ip6_tnl_dev_init(struct net_device *dev);
static void ip6_tnl_dev_setup(struct net_device *dev);
static struct rtnl_link_ops ip6_link_ops __read_mostly;
-static int ip6_tnl_net_id __read_mostly;
+static unsigned int ip6_tnl_net_id __read_mostly;
struct ip6_tnl_net {
/* the IPv6 tunnel fallback device */
struct net_device *fb_tnl_dev;
/* lists for storing tunnels in use */
- struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
struct ip6_tnl __rcu *tnls_wc[1];
struct ip6_tnl __rcu **tnls[2];
+ struct ip6_tnl __rcu *collect_md_tun;
};
static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@@ -155,6 +157,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
hash = HASH(&any, local);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_any(&t->parms.raddr) &&
(t->dev->flags & IFF_UP))
return t;
}
@@ -162,10 +165,15 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
hash = HASH(remote, &any);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
if (ipv6_addr_equal(remote, &t->parms.raddr) &&
+ ipv6_addr_any(&t->parms.laddr) &&
(t->dev->flags & IFF_UP))
return t;
}
+ t = rcu_dereference(ip6n->collect_md_tun);
+ if (t)
+ return t;
+
t = rcu_dereference(ip6n->tnls_wc[0]);
if (t && (t->dev->flags & IFF_UP))
return t;
@@ -209,6 +217,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, t);
rcu_assign_pointer(t->next , rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@@ -224,6 +234,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter;
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, NULL);
+
for (tp = ip6_tnl_bucket(ip6n, &t->parms);
(iter = rtnl_dereference(*tp)) != NULL;
tp = &iter->next) {
@@ -387,18 +400,19 @@ ip6_tnl_dev_uninit(struct net_device *dev)
__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
{
- const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
- __u8 nexthdr = ipv6h->nexthdr;
- __u16 off = sizeof(*ipv6h);
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
+ unsigned int nhoff = raw - skb->data;
+ unsigned int off = nhoff + sizeof(*ipv6h);
+ u8 next, nexthdr = ipv6h->nexthdr;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
- __u16 optlen = 0;
struct ipv6_opt_hdr *hdr;
- if (raw + off + sizeof(*hdr) > skb->data &&
- !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr)))
+ u16 optlen;
+
+ if (!pskb_may_pull(skb, off + sizeof(*hdr)))
break;
- hdr = (struct ipv6_opt_hdr *) (raw + off);
+ hdr = (struct ipv6_opt_hdr *)(skb->data + off);
if (nexthdr == NEXTHDR_FRAGMENT) {
struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
if (frag_hdr->frag_off)
@@ -409,20 +423,29 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
} else {
optlen = ipv6_optlen(hdr);
}
+ /* cache hdr->nexthdr, since pskb_may_pull() might
+ * invalidate hdr
+ */
+ next = hdr->nexthdr;
if (nexthdr == NEXTHDR_DEST) {
- __u16 i = off + 2;
+ u16 i = 2;
+
+ /* Remember : hdr is no longer valid at this point. */
+ if (!pskb_may_pull(skb, off + optlen))
+ break;
+
while (1) {
struct ipv6_tlv_tnl_enc_lim *tel;
/* No more room for encapsulation limit */
- if (i + sizeof (*tel) > off + optlen)
+ if (i + sizeof(*tel) > optlen)
break;
- tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i];
+ tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
/* return index of option if found and valid */
if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
tel->length == 1)
- return i;
+ return i + off - nhoff;
/* else jump to next option */
if (tel->type)
i += tel->length + 2;
@@ -430,7 +453,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
i++;
}
}
- nexthdr = hdr->nexthdr;
+ nexthdr = next;
off += optlen;
}
return 0;
@@ -829,6 +852,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -865,6 +891,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
{
struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct metadata_dst *tun_dst = NULL;
int ret = -1;
rcu_read_lock();
@@ -881,7 +908,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
+ if (t->parms.collect_md) {
+ tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+ if (!tun_dst)
+ return 0;
+ }
+ ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_error);
}
@@ -1012,8 +1044,17 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
int mtu;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
+ bool use_cache = false;
+ u8 hop_limit;
int err = -1;
+ if (t->parms.collect_md) {
+ hop_limit = skb_tunnel_info(skb)->key.ttl;
+ goto route_lookup;
+ } else {
+ hop_limit = t->parms.hop_limit;
+ }
+
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
struct in6_addr *addr6;
@@ -1036,13 +1077,22 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
- } else if (!fl6->flowi6_mark)
+ } else if (!(t->parms.flags &
+ (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
+ /* enable the cache only only if the routing decision does
+ * not depend on the current inner header value
+ */
+ use_cache = true;
+ }
+
+ if (use_cache)
dst = dst_cache_get(&t->dst_cache);
if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
goto tx_err_link_failure;
if (!dst) {
+route_lookup:
dst = ip6_route_output(net, NULL, fl6);
if (dst->error)
@@ -1053,6 +1103,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
dst = NULL;
goto tx_err_link_failure;
}
+ if (t->parms.collect_md &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6->daddr, 0, &fl6->saddr))
+ goto tx_err_link_failure;
ndst = dst;
}
@@ -1064,16 +1118,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - psh_hlen;
+ mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
}
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- if (skb_dst(skb))
+ if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- if (skb->len > mtu && !skb_is_gso(skb)) {
+ if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
goto tx_err_dst_release;
@@ -1111,13 +1165,18 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
skb = new_skb;
}
- if (!fl6->flowi6_mark && ndst)
- dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+ if (t->parms.collect_md) {
+ if (t->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_err_dst_release;
+ } else {
+ if (use_cache && ndst)
+ dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+ }
skb_dst_set(skb, dst);
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
- ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+ ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL, NULL);
}
/* Calculate max headroom for all the headers and adjust
@@ -1137,7 +1196,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
- ipv6h->hop_limit = t->parms.hop_limit;
+ ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
@@ -1170,19 +1229,36 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
+ dsfield = ipv4_get_dsfield(iph);
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
- dsfield = ipv4_get_dsfield(iph);
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -1;
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ } else {
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
- & IPV6_TCLASS_MASK;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+ & IPV6_TCLASS_MASK;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+ }
+
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1220,29 +1296,51 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
ip6_tnl_addr_conflict(t, ipv6h))
return -1;
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
+ dsfield = ipv6_get_dsfield(ipv6h);
+
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ } else {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+ encap_limit = t->parms.encap_limit;
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
- dsfield = ipv6_get_dsfield(ipv6h);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+ }
+
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1563,7 +1661,7 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
struct ip6_tnl *tnl = netdev_priv(dev);
if (tnl->parms.proto == IPPROTO_IPIP) {
- if (new_mtu < 68)
+ if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
} else {
if (new_mtu < IPV6_MIN_MTU)
@@ -1716,6 +1814,8 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
dev->mtu = ETH_DATA_LEN - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - dev->hard_header_len;
return 0;
@@ -1741,6 +1841,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err)
return err;
ip6_tnl_link_config(t);
+ if (t->parms.collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
return 0;
}
@@ -1811,6 +1915,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_PROTO])
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ parms->collect_md = true;
}
static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@@ -1850,6 +1957,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct ip6_tnl *nt, *t;
struct ip_tunnel_encap ipencap;
@@ -1864,9 +1972,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
ip6_tnl_netlink_parms(data, &nt->parms);
- t = ip6_tnl_locate(net, &nt->parms, 0);
- if (!IS_ERR(t))
- return -EEXIST;
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ip6n->collect_md_tun))
+ return -EEXIST;
+ } else {
+ t = ip6_tnl_locate(net, &nt->parms, 0);
+ if (!IS_ERR(t))
+ return -EEXIST;
+ }
return ip6_tnl_create2(dev);
}
@@ -1890,6 +2003,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
ip6_tnl_netlink_parms(data, &p);
+ if (p.collect_md)
+ return -EINVAL;
t = ip6_tnl_locate(net, &p, 0);
if (!IS_ERR(t)) {
@@ -1937,6 +2052,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -1955,16 +2072,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
- tunnel->encap.type) ||
- nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
- tunnel->encap.sport) ||
- nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
- tunnel->encap.dport) ||
- nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
- tunnel->encap.flags))
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
goto nla_put_failure;
+ if (parm->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -1992,6 +2108,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {
@@ -2033,7 +2150,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
if (dev->rtnl_link_ops == &ip6_link_ops)
unregister_netdevice_queue(dev, &list);
- for (h = 0; h < HASH_SIZE; h++) {
+ for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
t = rtnl_dereference(ip6n->tnls_r_l[h]);
while (t) {
/* If dev is in the same netns, it has already
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index a7520528ecd2..b283f293ee4a 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -88,9 +88,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
uh->len = htons(skb->len);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
- | IPSKB_REROUTED);
skb_dst_set(skb, dst);
udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 5bd3afdcc771..d82042c8d8fd 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -50,26 +50,26 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#define HASH_SIZE_SHIFT 5
-#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+#define IP6_VTI_HASH_SIZE_SHIFT 5
+#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
{
u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
- return hash_32(hash, HASH_SIZE_SHIFT);
+ return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT);
}
static int vti6_dev_init(struct net_device *dev);
static void vti6_dev_setup(struct net_device *dev);
static struct rtnl_link_ops vti6_link_ops __read_mostly;
-static int vti6_net_id __read_mostly;
+static unsigned int vti6_net_id __read_mostly;
struct vti6_net {
/* the vti6 tunnel fallback device */
struct net_device *fb_tnl_dev;
/* lists for storing tunnels in use */
- struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE];
struct ip6_tnl __rcu *tnls_wc[1];
struct ip6_tnl __rcu **tnls[2];
};
@@ -189,12 +189,12 @@ static int vti6_tnl_create2(struct net_device *dev)
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
int err;
+ dev->rtnl_link_ops = &vti6_link_ops;
err = register_netdevice(dev);
if (err < 0)
goto out;
strcpy(t->parms.name, dev->name);
- dev->rtnl_link_ops = &vti6_link_ops;
dev_hold(dev);
vti6_tnl_link(ip6n, t);
@@ -608,9 +608,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
@@ -812,30 +813,11 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return err;
}
-/**
- * vti6_tnl_change_mtu - change mtu manually for tunnel device
- * @dev: virtual device associated with tunnel
- * @new_mtu: the new mtu
- *
- * Return:
- * 0 on success,
- * %-EINVAL if mtu too small
- **/
-static int vti6_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < IPV6_MIN_MTU)
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops vti6_netdev_ops = {
.ndo_init = vti6_dev_init,
.ndo_uninit = vti6_dev_uninit,
.ndo_start_xmit = vti6_tnl_xmit,
.ndo_do_ioctl = vti6_ioctl,
- .ndo_change_mtu = vti6_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -855,6 +837,8 @@ static void vti6_dev_setup(struct net_device *dev)
dev->type = ARPHRD_TUNNEL6;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = IPV6_MIN_MTU;
+ dev->max_mtu = IP_MAX_MTU;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
netif_keep_dst(dev);
@@ -1051,7 +1035,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
struct ip6_tnl *t;
LIST_HEAD(list);
- for (h = 0; h < HASH_SIZE; h++) {
+ for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
t = rtnl_dereference(ip6n->tnls_r_l[h]);
while (t) {
unregister_netdevice_queue(t->dev, &list);
@@ -1138,6 +1122,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
.priority = 100,
};
+static bool is_vti6_tunnel(const struct net_device *dev)
+{
+ return dev->netdev_ops == &vti6_netdev_ops;
+}
+
+static int vti6_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!is_vti6_tunnel(dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ if (!net_eq(t->net, dev_net(dev)))
+ xfrm_garbage_collect(t->net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vti6_notifier_block __read_mostly = {
+ .notifier_call = vti6_device_event,
+};
+
/**
* vti6_tunnel_init - register protocol and reserve needed resources
*
@@ -1148,6 +1159,8 @@ static int __init vti6_tunnel_init(void)
const char *msg;
int err;
+ register_netdevice_notifier(&vti6_notifier_block);
+
msg = "tunnel device";
err = register_pernet_device(&vti6_net_ops);
if (err < 0)
@@ -1180,6 +1193,7 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti6_net_ops);
pernet_dev_failed:
+ unregister_netdevice_notifier(&vti6_notifier_block);
pr_err("vti6 init: failed to register %s\n", msg);
return err;
}
@@ -1194,6 +1208,7 @@ static void __exit vti6_tunnel_cleanup(void)
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti6_net_ops);
+ unregister_netdevice_notifier(&vti6_notifier_block);
}
module_init(vti6_tunnel_init);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7f4265b1649b..604d8953c775 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -16,7 +16,7 @@
*
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
@@ -636,7 +636,7 @@ static int pim6_rcv(struct sk_buff *skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
+ if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
(pim->flags & PIM_NULL_REGISTER) ||
(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
sizeof(*pim), IPPROTO_PIM,
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 5330262ab673..ee97c44e2aa0 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -52,8 +52,9 @@
#include <net/udplite.h>
#include <net/xfrm.h>
#include <net/compat.h>
+#include <net/seg6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
@@ -120,6 +121,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
static bool setsockopt_needs_rtnl(int optname)
{
switch (optname) {
+ case IPV6_ADDRFORM:
case IPV6_ADD_MEMBERSHIP:
case IPV6_DROP_MEMBERSHIP:
case IPV6_JOIN_ANYCAST:
@@ -198,7 +200,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
}
fl6_free_socklist(sk);
- ipv6_sock_mc_close(sk);
+ __ipv6_sock_mc_close(sk);
/*
* Sock is moving from IPv6 to IPv4 (sk_prot), so
@@ -429,6 +431,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
#endif
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)
+ opt->srcrt;
+
+ if (!seg6_validate_srh(srh, optlen))
+ goto sticky_done;
+ break;
+ }
default:
goto sticky_done;
}
@@ -867,6 +878,10 @@ pref_skip_coa:
np->autoflowlabel = valbool;
retv = 0;
break;
+ case IPV6_RECVFRAGSIZE:
+ np->rxopt.bits.recvfragsize = valbool;
+ retv = 0;
+ break;
}
release_sock(sk);
@@ -1309,6 +1324,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->autoflowlabel;
break;
+ case IPV6_RECVFRAGSIZE:
+ val = np->rxopt.bits.recvfragsize;
+ break;
+
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index d64ee7e83664..1bdc703cb966 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -81,7 +81,7 @@ static void mld_gq_timer_expire(unsigned long data);
static void mld_ifc_timer_expire(unsigned long data);
static void mld_ifc_event(struct inet6_dev *idev);
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr);
+static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
static void mld_clear_delrec(struct inet6_dev *idev);
static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
@@ -276,16 +276,14 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
return idev;
}
-void ipv6_sock_mc_close(struct sock *sk)
+void __ipv6_sock_mc_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6_mc_socklist *mc_lst;
struct net *net = sock_net(sk);
- if (!rcu_access_pointer(np->ipv6_mc_list))
- return;
+ ASSERT_RTNL();
- rtnl_lock();
while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
struct net_device *dev;
@@ -303,8 +301,17 @@ void ipv6_sock_mc_close(struct sock *sk)
atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
kfree_rcu(mc_lst, rcu);
-
}
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (!rcu_access_pointer(np->ipv6_mc_list))
+ return;
+ rtnl_lock();
+ __ipv6_sock_mc_close(sk);
rtnl_unlock();
}
@@ -685,9 +692,9 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
dev_mc_del(dev, buf);
}
- if (mc->mca_flags & MAF_NOREPORT)
- goto done;
spin_unlock_bh(&mc->mca_lock);
+ if (mc->mca_flags & MAF_NOREPORT)
+ return;
if (!mc->idev->dead)
igmp6_leave_group(mc);
@@ -695,8 +702,6 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
spin_lock_bh(&mc->mca_lock);
if (del_timer(&mc->mca_timer))
atomic_dec(&mc->mca_refcnt);
-done:
- ip6_mc_clear_src(mc);
spin_unlock_bh(&mc->mca_lock);
}
@@ -741,10 +746,11 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
spin_unlock_bh(&idev->mc_lock);
}
-static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
+static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
struct ifmcaddr6 *pmc, *pmc_prev;
- struct ip6_sf_list *psf, *psf_next;
+ struct ip6_sf_list *psf;
+ struct in6_addr *pmca = &im->mca_addr;
spin_lock_bh(&idev->mc_lock);
pmc_prev = NULL;
@@ -761,14 +767,21 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
}
spin_unlock_bh(&idev->mc_lock);
+ spin_lock_bh(&im->mca_lock);
if (pmc) {
- for (psf = pmc->mca_tomb; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ im->idev = pmc->idev;
+ im->mca_crcount = idev->mc_qrv;
+ im->mca_sfmode = pmc->mca_sfmode;
+ if (pmc->mca_sfmode == MCAST_INCLUDE) {
+ im->mca_tomb = pmc->mca_tomb;
+ im->mca_sources = pmc->mca_sources;
+ for (psf = im->mca_sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = im->mca_crcount;
}
in6_dev_put(pmc->idev);
kfree(pmc);
}
+ spin_unlock_bh(&im->mca_lock);
}
static void mld_clear_delrec(struct inet6_dev *idev)
@@ -897,7 +910,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
mca_get(mc);
write_unlock_bh(&idev->lock);
- mld_del_delrec(idev, &mc->mca_addr);
+ mld_del_delrec(idev, mc);
igmp6_group_added(mc);
ma_put(mc);
return 0;
@@ -920,6 +933,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
write_unlock_bh(&idev->lock);
igmp6_group_dropped(ma);
+ ip6_mc_clear_src(ma);
ma_put(ma);
return 0;
@@ -1739,6 +1753,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
continue;
}
+ /* Based on RFC3810 6.1. Should not send source-list change
+ * records when there is a filter mode change.
+ */
+ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) ||
+ (!gdeleted && pmc->mca_crcount)) &&
+ (type == MLD2_ALLOW_NEW_SOURCES ||
+ type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+ goto decrease_sf_crcount;
+
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
@@ -1766,6 +1789,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
scount++; stotal++;
if ((type == MLD2_ALLOW_NEW_SOURCES ||
type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
@@ -2484,15 +2508,17 @@ void ipv6_mc_down(struct inet6_dev *idev)
/* Withdraw multicast list */
read_lock_bh(&idev->lock);
- mld_ifc_stop_timer(idev);
- mld_gq_stop_timer(idev);
- mld_dad_stop_timer(idev);
for (i = idev->mc_list; i; i = i->next)
igmp6_group_dropped(i);
- read_unlock_bh(&idev->lock);
- mld_clear_delrec(idev);
+ /* Should stop timer after group drop. or we will
+ * start timer again in mld_ifc_event()
+ */
+ mld_ifc_stop_timer(idev);
+ mld_gq_stop_timer(idev);
+ mld_dad_stop_timer(idev);
+ read_unlock_bh(&idev->lock);
}
static void ipv6_mc_reset(struct inet6_dev *idev)
@@ -2514,8 +2540,10 @@ void ipv6_mc_up(struct inet6_dev *idev)
read_lock_bh(&idev->lock);
ipv6_mc_reset(idev);
- for (i = idev->mc_list; i; i = i->next)
+ for (i = idev->mc_list; i; i = i->next) {
+ mld_del_delrec(idev, i);
igmp6_group_added(i);
+ }
read_unlock_bh(&idev->lock);
}
@@ -2548,6 +2576,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
/* Deactivate timers */
ipv6_mc_down(idev);
+ mld_clear_delrec(idev);
/* Delete all-nodes address. */
/* We cannot call ipv6_dev_mc_dec() directly, our caller in
@@ -2562,11 +2591,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
write_lock_bh(&idev->lock);
while ((i = idev->mc_list) != NULL) {
idev->mc_list = i->next;
- write_unlock_bh(&idev->lock);
- igmp6_group_dropped(i);
+ write_unlock_bh(&idev->lock);
ma_put(i);
-
write_lock_bh(&idev->lock);
}
write_unlock_bh(&idev->lock);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 60c79a08e14a..64f0f7be9e5e 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -191,7 +191,7 @@ static inline int mip6_report_rl_allow(ktime_t stamp,
int allow = 0;
spin_lock_bh(&mip6_report_rl.lock);
- if (!ktime_equal(mip6_report_rl.stamp, stamp) ||
+ if (mip6_report_rl.stamp != stamp ||
mip6_report_rl.iif != iif ||
!ipv6_addr_equal(&mip6_report_rl.src, src) ||
!ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index fe65cdc28a45..7ebac630d3c6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,7 +67,6 @@
#include <net/flow.h>
#include <net/ip6_checksum.h>
#include <net/inet_common.h>
-#include <net/l3mdev.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
@@ -234,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
+ case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
ND_PRINTK(2, warn,
@@ -457,11 +457,9 @@ static void ndisc_send_skb(struct sk_buff *skb,
if (!dst) {
struct flowi6 fl6;
- int oif = l3mdev_fib_oif(skb->dev);
+ int oif = skb->dev->ifindex;
icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
- if (oif != skb->dev->ifindex)
- fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
dst = icmp6_dst_alloc(skb->dev, &fl6);
if (IS_ERR(dst)) {
kfree_skb(skb);
@@ -571,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
}
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr)
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
{
struct sk_buff *skb;
struct in6_addr addr_buf;
@@ -591,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (inc_opt)
optlen += ndisc_opt_addr_space(dev,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0)
+ optlen += 8;
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -608,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0) {
+ u8 *opt = skb_put(skb, 8);
+
+ opt[0] = ND_OPT_NONCE;
+ opt[1] = 8 >> 3;
+ memcpy(opt + 2, &nonce, 6);
+ }
ndisc_send_skb(skb, daddr, saddr);
}
@@ -696,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
- ndisc_send_ns(dev, target, target, saddr);
+ ndisc_send_ns(dev, target, target, saddr, 0);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, target, &mcaddr, saddr);
+ ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
}
}
@@ -745,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
int dad = ipv6_addr_any(saddr);
bool inc;
int is_router = -1;
+ u64 nonce = 0;
if (skb->len < sizeof(struct nd_msg)) {
ND_PRINTK(2, warn, "NS: packet too short\n");
@@ -789,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
}
+ if (ndopts.nd_opts_nonce)
+ memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
@@ -797,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb)
have_ifp:
if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
if (dad) {
+ if (nonce != 0 && ifp->dad_nonce == nonce) {
+ u8 *np = (u8 *)&nonce;
+ /* Matching nonce if looped back */
+ ND_PRINTK(2, notice,
+ "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+ ifp->idev->dev->name,
+ &ifp->addr, np);
+ goto out;
+ }
/*
* We are colliding with another node
* who is doing DAD
@@ -1538,7 +1558,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
int rd_len;
u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
- int oif = l3mdev_fib_oif(dev);
bool ret;
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1555,10 +1574,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
}
icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
- &saddr_buf, &ipv6_hdr(skb)->saddr, oif);
-
- if (oif != skb->dev->ifindex)
- fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
+ &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d11c46833d61..39970e212ad5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, skb->sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index e10a04c9cdc7..6acb2eecd986 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV6
To compile it as a module, choose M here. If unsure, say N.
+config NF_SOCKET_IPV6
+ tristate "IPv6 socket lookup support"
+ help
+ This option enables the IPv6 socket lookup infrastructure. This
+ is used by the ip6tables socket match.
+
if NF_TABLES
config NF_TABLES_IPV6
@@ -54,6 +60,14 @@ config NFT_DUP_IPV6
help
This module enables IPv6 packet duplication support for nf_tables.
+config NFT_FIB_IPV6
+ tristate "nf_tables fib / ipv6 route lookup support"
+ select NFT_FIB
+ help
+ This module enables IPv6 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
endif # NF_TABLES_IPV6
endif # NF_TABLES
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index b4f7d0b4e2af..fe180c96040e 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
+obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+
# logging
obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
@@ -40,6 +42,7 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
+obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 552fac2f390a..25a022d41a70 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -24,7 +24,7 @@
#include <linux/icmpv6.h>
#include <net/ipv6.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -291,11 +291,7 @@ ip6t_do_table(struct sk_buff *skb,
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV6;
- acpar.hooknum = hook;
+ acpar.state = state;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -566,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -574,12 +571,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -616,7 +610,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -703,8 +697,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
-
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -713,6 +706,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ip6t_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ip6t_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -772,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -1007,7 +1002,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
"ip6table_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct ip6t_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1037,7 +1032,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET6);
@@ -1063,7 +1058,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET6, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1074,7 +1069,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -1099,8 +1094,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
"ip6table_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1214,8 +1209,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
if (IS_ERR(paddc))
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET6, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1651,7 +1646,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
xt_compat_lock(AF_INET6);
t = xt_find_table_lock(net, AF_INET6, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1665,7 +1660,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(AF_INET6);
return ret;
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 7f9f45d829d2..2b1a15846f9a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -24,7 +24,7 @@
static unsigned int
masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out);
+ return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
}
static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index db29bbf41b59..fa51a205918d 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,35 +39,40 @@ static unsigned int
reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_reject_info *reject = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
switch (reject->with) {
case IP6T_ICMP6_NO_ROUTE:
- nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par));
break;
case IP6T_ICMP6_ADM_PROHIBITED:
- nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_NOT_NEIGHBOUR:
- nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_ADDR_UNREACH:
- nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_PORT_UNREACH:
- nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_ECHOREPLY:
/* Do nothing */
break;
case IP6T_TCP_RESET:
- nf_send_reset6(net, skb, par->hooknum);
+ nf_send_reset6(net, skb, xt_hooknum(par));
break;
case IP6T_ICMP6_POLICY_FAIL:
- nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
break;
case IP6T_ICMP6_REJECT_ROUTE:
- nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE,
+ xt_hooknum(par));
break;
}
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 06bed74cf5ee..98c8dd38575a 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -277,12 +277,12 @@ static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
- if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
return NF_DROP;
th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -440,12 +440,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
e->ipv6.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg6_reg __read_mostly = {
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 1ee1b25df096..b12e61b7b16c 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
return ret;
}
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
- const struct rt6_info *rt = (const void *) skb_dst(skb);
- return rt && (rt->rt6i_flags & RTF_LOCAL);
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
struct ipv6hdr *iph;
bool invert = info->flags & XT_RPFILTER_INVERT;
- if (rpfilter_is_local(skb))
+ if (rpfilter_is_loopback(skb, xt_in(par)))
return true ^ invert;
iph = ipv6_hdr(skb);
@@ -93,7 +93,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (unlikely(saddrtype == IPV6_ADDR_ANY))
return true ^ invert; /* not routable: forward path will drop it */
- return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par),
+ info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 1aa5848764a7..4e3402486833 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -34,6 +34,13 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_log.h>
+static int conntrack6_net_id;
+static DEFINE_MUTEX(register_ipv6_hooks);
+
+struct conntrack6_net {
+ unsigned int users;
+};
+
static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -115,7 +122,7 @@ static unsigned int ipv6_helper(void *priv,
help = nfct_help(ct);
if (!help)
return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
@@ -308,6 +315,42 @@ static int ipv6_nlattr_tuple_size(void)
}
#endif
+static int ipv6_hooks_register(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv6_hooks);
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv6_enable(net);
+ if (err < 0) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv6_hooks);
+ return err;
+}
+
+static void ipv6_hooks_unregister(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+
+ mutex_lock(&register_ipv6_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ mutex_unlock(&register_ipv6_hooks);
+}
+
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.l3proto = PF_INET6,
.name = "ipv6",
@@ -321,6 +364,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.nlattr_to_tuple = ipv6_nlattr_to_tuple,
.nla_policy = ipv6_nla_policy,
#endif
+ .net_ns_get = ipv6_hooks_register,
+ .net_ns_put = ipv6_hooks_unregister,
.me = THIS_MODULE,
};
@@ -336,52 +381,51 @@ static struct nf_sockopt_ops so_getorigdst6 = {
.owner = THIS_MODULE,
};
+static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
+ &nf_conntrack_l4proto_tcp6,
+ &nf_conntrack_l4proto_udp6,
+ &nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite6,
+#endif
+};
+
static int ipv6_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6);
- if (ret < 0) {
- pr_err("nf_conntrack_tcp6: pernet registration failed\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6);
- if (ret < 0) {
- pr_err("nf_conntrack_udp6: pernet registration failed\n");
- goto cleanup_tcp6;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6);
- if (ret < 0) {
- pr_err("nf_conntrack_icmp6: pernet registration failed\n");
- goto cleanup_udp6;
- }
+ ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
+ if (ret < 0)
+ return ret;
+
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6);
if (ret < 0) {
pr_err("nf_conntrack_ipv6: pernet registration failed.\n");
- goto cleanup_icmpv6;
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
}
- return 0;
- cleanup_icmpv6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
- cleanup_udp6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
- cleanup_tcp6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
- out:
return ret;
}
static void ipv6_net_exit(struct net *net)
{
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
}
static struct pernet_operations ipv6_net_ops = {
.init = ipv6_net_init,
.exit = ipv6_net_exit,
+ .id = &conntrack6_net_id,
+ .size = sizeof(struct conntrack6_net),
};
static int __init nf_conntrack_l3proto_ipv6_init(void)
@@ -389,7 +433,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv6_enable();
ret = nf_register_sockopt(&so_getorigdst6);
if (ret < 0) {
@@ -401,47 +444,20 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
if (ret < 0)
goto cleanup_sockopt;
- ret = nf_register_hooks(ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register pre-routing defrag "
- "hook.\n");
+ ret = nf_ct_l4proto_register(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
+ if (ret < 0)
goto cleanup_pernet;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n");
- goto cleanup_hooks;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n");
- goto cleanup_tcp6;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n");
- goto cleanup_udp6;
- }
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
if (ret < 0) {
pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
- goto cleanup_icmpv6;
+ goto cleanup_l4proto;
}
return ret;
-
- cleanup_icmpv6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- cleanup_udp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- cleanup_tcp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_hooks:
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+cleanup_l4proto:
+ nf_ct_l4proto_unregister(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
cleanup_pernet:
unregister_pernet_subsys(&ipv6_net_ops);
cleanup_sockopt:
@@ -453,10 +469,8 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
{
synchronize_net();
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+ nf_ct_l4proto_unregister(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
unregister_pernet_subsys(&ipv6_net_ops);
nf_unregister_sockopt(&so_getorigdst6);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 660bc10c7a9c..f5a61bc3ec2b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
return -NF_ACCEPT;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e4347aeb2e65..9948b5ce52da 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
/* Jumbo payload inhibits frag. header */
if (ipv6_hdr(skb)->payload_len == 0) {
pr_debug("payload len = 0\n");
- return -EINVAL;
+ return 0;
}
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
- return -EINVAL;
+ return 0;
if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
return -ENOMEM;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f7aab5ab93a5..8e0bdd058787 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -30,6 +30,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+static DEFINE_MUTEX(defrag6_mutex);
+
static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
@@ -69,7 +71,7 @@ static unsigned int ipv6_defrag(void *priv,
if (err == -EINPROGRESS)
return NF_STOLEN;
- return NF_ACCEPT;
+ return err == 0 ? NF_ACCEPT : NF_DROP;
}
static struct nf_hook_ops ipv6_defrag_ops[] = {
@@ -87,6 +89,19 @@ static struct nf_hook_ops ipv6_defrag_ops[] = {
},
};
+static void __net_exit defrag6_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv6) {
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ net->nf.defrag_ipv6 = false;
+ }
+}
+
+static struct pernet_operations defrag6_net_ops = {
+ .exit = defrag6_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
int ret = 0;
@@ -96,9 +111,9 @@ static int __init nf_defrag_init(void)
pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
return ret;
}
- ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ ret = register_pernet_subsys(&defrag6_net_ops);
if (ret < 0) {
- pr_err("nf_defrag_ipv6: can't register hooks\n");
+ pr_err("nf_defrag_ipv6: can't register pernet ops\n");
goto cleanup_frag6;
}
return ret;
@@ -111,12 +126,31 @@ cleanup_frag6:
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ unregister_pernet_subsys(&defrag6_net_ops);
nf_ct_frag6_cleanup();
}
-void nf_defrag_ipv6_enable(void)
+int nf_defrag_ipv6_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv6)
+ return 0;
+
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv6 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag6_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 8dd869642f45..57d86066a13b 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
else
- logflags = NF_LOG_MASK;
+ logflags = NF_LOG_DEFAULT_MASK;
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
@@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
}
/* Max length: 48 "OPT (...) " */
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, "OPT ( ");
switch (currenthdr) {
@@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
if (fragment) {
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, ")");
return;
}
@@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
break;
/* Max Length */
case IPPROTO_AH:
- if (logflags & XT_LOG_IPOPT) {
+ if (logflags & NF_LOG_IPOPT) {
struct ip_auth_hdr _ahdr;
const struct ip_auth_hdr *ah;
@@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
hdrlen = (hp->hdrlen+2)<<2;
break;
case IPPROTO_ESP:
- if (logflags & XT_LOG_IPOPT) {
+ if (logflags & NF_LOG_IPOPT) {
struct ip_esp_hdr _esph;
const struct ip_esp_hdr *eh;
@@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
return;
}
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, ") ");
currenthdr = hp->nexthdr;
@@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
}
/* Max length: 15 "UID=4294967295 " */
- if ((logflags & XT_LOG_UID) && recurse)
+ if ((logflags & NF_LOG_UID) && recurse)
nf_log_dump_sk_uid_gid(m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
- if (!(logflags & XT_LOG_MACDECODE))
+ if (!(logflags & NF_LOG_MACDECODE))
goto fallback;
switch (dev->type) {
@@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = {
static int __net_init nf_log_ipv6_net_init(struct net *net)
{
- nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
- return 0;
+ return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
}
static void __net_exit nf_log_ipv6_net_exit(struct net *net)
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index a5400223fd74..eedee5d108d9 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -156,6 +156,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
@@ -179,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
skb_dst_set(nskb, dst);
+ nskb->mark = fl6.flowi6_mark;
+
skb_reserve(nskb, hh_len + dst->header_len);
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
ip6_dst_hoplimit(dst));
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
new file mode 100644
index 000000000000..ebb2bf84232a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ int *protocol,
+ const struct in6_addr **raddr,
+ const struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport,
+ struct ipv6hdr *ipv6_var)
+{
+ const struct ipv6hdr *inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ __be16 inside_fragoff;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+ sizeof(*ipv6_var), ipv6_var);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
+ sizeof(*ipv6_var),
+ &inside_nexthdr, &inside_fragoff);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+ const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet6_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct in6_addr *daddr = NULL, *saddr = NULL;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ int doff = 0;
+ int thoff = 0, tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NULL;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = tproto == IPPROTO_TCP ?
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+ thoff + sizeof(*hp);
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ struct ipv6hdr ipv6_var;
+
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport, &ipv6_var))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+ return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
+ sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv6 socket lookup infrastructure");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 30b22f4dff55..d6e4ba5de916 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
{
struct nft_pktinfo pkt;
- /* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
- return NF_DROP;
+ nft_set_pktinfo_ipv6(&pkt, skb, state);
return nft_do_chain(&pkt, priv);
}
@@ -102,7 +100,10 @@ static int __init nf_tables_ipv6_init(void)
{
int ret;
- nft_register_chain_type(&filter_ipv6);
+ ret = nft_register_chain_type(&filter_ipv6);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_ipv6);
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 2535223ba956..f2727475895e 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -33,9 +33,7 @@ static unsigned int nf_route_table_hook(void *priv,
u32 mark, flowlabel;
int err;
- /* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
- return NF_DROP;
+ nft_set_pktinfo_ipv6(&pkt, skb, state);
/* save source/dest address, mark, hoplimit, flowlabel, priority */
memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 8bfd470cbe72..d8b5b60b7d53 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -26,9 +26,9 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr,
{
struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
- int oif = regs->data[priv->sreg_dev];
+ int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
- nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif);
+ nf_dup_ipv6(nft_net(pkt), pkt->skb, nft_hook(pkt), gw, oif);
}
static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
@@ -57,7 +57,9 @@ static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
- if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+ goto nla_put_failure;
+ if (priv->sreg_dev &&
nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
goto nla_put_failure;
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
new file mode 100644
index 000000000000..765facf03d45
--- /dev/null
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -0,0 +1,270 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt,
+ const struct net_device *dev)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(pkt->skb);
+ int lookup_flags = 0;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl6->daddr = iph->daddr;
+ fl6->saddr = iph->saddr;
+ } else {
+ fl6->daddr = iph->saddr;
+ fl6->saddr = iph->daddr;
+ }
+
+ if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev);
+ }
+
+ if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST)
+ lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl6->flowi6_mark = pkt->skb->mark;
+
+ fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
+
+ return lookup_flags;
+}
+
+static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt)
+{
+ const struct net_device *dev = NULL;
+ const struct nf_ipv6_ops *v6ops;
+ const struct nf_afinfo *afinfo;
+ int route_err, addrtype;
+ struct rt6_info *rt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ };
+ u32 ret = 0;
+
+ afinfo = nf_get_afinfo(NFPROTO_IPV6);
+ if (!afinfo)
+ return RTN_UNREACHABLE;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ nft_fib6_flowi_init(&fl6, priv, pkt, dev);
+
+ v6ops = nf_get_ipv6_ops();
+ if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
+ ret = RTN_LOCAL;
+
+ route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt,
+ flowi6_to_flowi(&fl6), false);
+ if (route_err)
+ goto err;
+
+ if (rt->rt6i_flags & RTF_REJECT) {
+ route_err = rt->dst.error;
+ dst_release(&rt->dst);
+ goto err;
+ }
+
+ if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr))
+ ret = RTN_ANYCAST;
+ else if (!dev && rt->rt6i_flags & RTF_LOCAL)
+ ret = RTN_LOCAL;
+
+ dst_release(&rt->dst);
+
+ if (ret)
+ return ret;
+
+ addrtype = ipv6_addr_type(&fl6.daddr);
+
+ if (addrtype & IPV6_ADDR_MULTICAST)
+ return RTN_MULTICAST;
+ if (addrtype & IPV6_ADDR_UNICAST)
+ return RTN_UNICAST;
+
+ return RTN_UNSPEC;
+ err:
+ switch (route_err) {
+ case -EINVAL:
+ return RTN_BLACKHOLE;
+ case -EACCES:
+ return RTN_PROHIBIT;
+ case -EAGAIN:
+ return RTN_THROW;
+ default:
+ break;
+ }
+
+ return RTN_UNREACHABLE;
+}
+
+void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ *dest = __nft_fib6_eval_type(priv, pkt);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
+
+void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ const struct net_device *oif = NULL;
+ u32 *dest = &regs->data[priv->dreg];
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ };
+ struct rt6_info *rt;
+ int lookup_flags;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+
+ lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif);
+
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+ nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ nft_in(pkt)->ifindex);
+ return;
+ }
+
+ *dest = 0;
+ again:
+ rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags);
+ if (rt->dst.error)
+ goto put_rt_err;
+
+ /* Should not see RTF_LOCAL here */
+ if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
+ goto put_rt_err;
+
+ if (oif && oif != rt->rt6i_idev->dev) {
+ /* multipath route? Try again with F_IFACE */
+ if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6.flowi6_oif = oif->ifindex;
+ ip6_rt_put(rt);
+ goto again;
+ }
+ }
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ *dest = rt->rt6i_idev->dev->ifindex;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ put_rt_err:
+ ip6_rt_put(rt);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval);
+
+static struct nft_expr_type nft_fib6_type;
+
+static const struct nft_expr_ops nft_fib6_type_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops nft_fib6_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops *
+nft_fib6_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib6_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib6_type __read_mostly = {
+ .name = "fib",
+ .select_ops = &nft_fib6_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib6_module_init(void)
+{
+ return nft_register_expr(&nft_fib6_type);
+}
+
+static void __exit nft_fib6_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib6_type);
+}
+module_init(nft_fib6_module_init);
+module_exit(nft_fib6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 9597ffb74077..6c5b5b1830a7 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,14 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
range.max_proto.all =
*(__be16 *)&regs->data[priv->sreg_proto_max];
}
- regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
+ nft_out(pkt));
+}
+
+static void
+nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
}
static struct nft_expr_type nft_masq_ipv6_type;
@@ -41,6 +48,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv6_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +85,5 @@ module_init(nft_masq_ipv6_module_init);
module_exit(nft_masq_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index aca44e89a881..f5ac080fc084 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -35,7 +35,14 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
range.flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook);
+ regs->verdict.code =
+ nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
+}
+
+static void
+nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
}
static struct nft_expr_type nft_redir_ipv6_type;
@@ -44,6 +51,7 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv6_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -71,5 +79,5 @@ module_init(nft_redir_ipv6_module_init);
module_exit(nft_redir_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 92bda9908bb9..057deeaff1cb 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -27,11 +27,11 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
- pkt->hook);
+ nf_send_unreach6(nft_net(pkt), pkt->skb, priv->icmp_code,
+ nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 462f2a76b5c2..cd4252346a32 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -148,6 +148,15 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
ipv6_hdr(skb)->payload_len = htons(len);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
+ skb->protocol = htons(ETH_P_IPV6);
+
return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 0e983b694ee8..e1f8b34d7a2e 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -113,6 +113,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.daddr = *daddr;
fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -180,7 +181,7 @@ struct proto pingv6_prot = {
.init = ping_init_sock,
.close = ping_close,
.connect = ip6_datagram_connect_v6_only,
- .disconnect = udp_disconnect,
+ .disconnect = __udp_disconnect,
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.sendmsg = ping_v6_sendmsg,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 679253d0af84..cc8e3ae9ca73 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -30,6 +30,11 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
+#define MAX4(a, b, c, d) \
+ max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
+#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
+ IPSTATS_MIB_MAX, ICMP_MIB_MAX)
+
static int sockstat6_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq->private;
@@ -191,25 +196,34 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
atomic_long_t *smib,
const struct snmp_mib *itemlist)
{
+ unsigned long buff[SNMP_MIB_MAX];
int i;
- unsigned long val;
- for (i = 0; itemlist[i].name; i++) {
- val = pcpumib ?
- snmp_fold_field(pcpumib, itemlist[i].entry) :
- atomic_long_read(smib + itemlist[i].entry);
- seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val);
+ if (pcpumib) {
+ memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+
+ snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
+ for (i = 0; itemlist[i].name; i++)
+ seq_printf(seq, "%-32s\t%lu\n",
+ itemlist[i].name, buff[i]);
+ } else {
+ for (i = 0; itemlist[i].name; i++)
+ seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
+ atomic_long_read(smib + itemlist[i].entry));
}
}
static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
const struct snmp_mib *itemlist, size_t syncpoff)
{
+ u64 buff64[SNMP_MIB_MAX];
int i;
+ memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+
+ snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
for (i = 0; itemlist[i].name; i++)
- seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
- snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+ seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
}
static int snmp6_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 590dd1f7746f..ea89073c8247 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -65,11 +65,12 @@
#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
-static struct raw_hashinfo raw_v6_hashinfo = {
+struct raw_hashinfo raw_v6_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
};
+EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
-static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
unsigned short num, const struct in6_addr *loc_addr,
const struct in6_addr *rmt_addr, int dif)
{
@@ -102,6 +103,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
found:
return sk;
}
+EXPORT_SYMBOL_GPL(__raw_v6_lookup);
/*
* 0 - deliver
@@ -589,7 +591,11 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
}
offset += skb_transport_offset(skb);
- BUG_ON(skb_copy_bits(skb, offset, &csum, 2));
+ err = skb_copy_bits(skb, offset, &csum, 2);
+ if (err < 0) {
+ ip6_flush_pending_frames(sk);
+ goto out;
+ }
/* in case cksum was not initialized */
if (unlikely(csum))
@@ -653,6 +659,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
if (err)
goto error_fault;
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
NULL, rt->dst.dev, dst_output);
@@ -767,6 +780,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
@@ -1234,7 +1248,7 @@ struct proto rawv6_prot = {
.close = rawv6_close,
.destroy = raw6_destroy,
.connect = ip6_datagram_connect_v6_only,
- .disconnect = udp_disconnect,
+ .disconnect = __udp_disconnect,
.ioctl = rawv6_ioctl,
.init = rawv6_init_sk,
.setsockopt = rawv6_setsockopt,
@@ -1252,6 +1266,7 @@ struct proto rawv6_prot = {
.compat_getsockopt = compat_rawv6_getsockopt,
.compat_ioctl = compat_rawv6_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2160d5d009cb..e1da5b888cc4 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -211,7 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
{
struct sk_buff *prev, *next;
struct net_device *dev;
- int offset, end;
+ int offset, end, fragsize;
struct net *net = dev_net(skb_dst(skb)->dev);
u8 ecn;
@@ -336,6 +336,10 @@ found:
fq->ecn |= ecn;
add_frag_mem_limit(fq->q.net, skb->truesize);
+ fragsize = -skb_network_offset(skb) + skb->len;
+ if (fragsize > fq->q.max_size)
+ fq->q.max_size = fragsize;
+
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
*/
@@ -456,7 +460,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
skb_network_header(head)[nhoff] = skb_transport_header(head)[0];
memmove(head->head + sizeof(struct frag_hdr), head->head,
(head->data - head->head) - sizeof(struct frag_hdr));
- head->mac_header += sizeof(struct frag_hdr);
+ if (skb_mac_header_was_set(head))
+ head->mac_header += sizeof(struct frag_hdr);
head->network_header += sizeof(struct frag_hdr);
skb_reset_transport_header(head);
@@ -494,6 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
IP6CB(head)->nhoff = nhoff;
IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(head)->frag_max_size = fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
skb_postpush_rcsum(head, skb_network_header(head),
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 269218aacbea..7ea85370c11c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -64,7 +64,7 @@
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -102,11 +102,13 @@ static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev,
unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex);
+ const struct in6_addr *gwaddr,
+ struct net_device *dev);
#endif
struct uncached_list {
@@ -525,7 +527,7 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
dev_put(work->dev);
kfree(work);
}
@@ -656,7 +658,8 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
struct net_device *dev = rt->dst.dev;
if (dev && !netif_carrier_ok(dev) &&
- idev->cnf.ignore_routes_with_linkdown)
+ idev->cnf.ignore_routes_with_linkdown &&
+ !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
if (rt6_check_expired(rt))
@@ -803,7 +806,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
rt = rt6_get_dflt_router(gwaddr, dev);
else
rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
- gwaddr, dev->ifindex);
+ gwaddr, dev);
if (rt && !lifetime) {
ip6_del_rt(rt);
@@ -811,8 +814,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
}
if (!rt && lifetime)
- rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
- pref);
+ rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
+ dev, pref);
else if (rt)
rt->rt6i_flags = RTF_ROUTEINFO |
(rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
@@ -1050,6 +1053,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int strict = 0;
strict |= flags & RT6_LOOKUP_F_IFACE;
+ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
@@ -1147,15 +1151,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
- struct net_device *dev,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+ struct net_device *dev,
+ struct flowi6 *fl6, int flags)
{
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
void ip6_route_input(struct sk_buff *skb)
{
@@ -1164,7 +1169,7 @@ void ip6_route_input(struct sk_buff *skb)
int flags = RT6_LOOKUP_F_HAS_SADDR;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
- .flowi6_iif = l3mdev_fib_oif(skb->dev),
+ .flowi6_iif = skb->dev->ifindex,
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
@@ -1188,12 +1193,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
struct flowi6 *fl6, int flags)
{
- struct dst_entry *dst;
bool any_src;
- dst = l3mdev_get_rt6_dst(net, fl6);
- if (dst)
- return dst;
+ if (rt6_need_strict(&fl6->daddr)) {
+ struct dst_entry *dst;
+
+ dst = l3mdev_link_scope_lookup(net, fl6);
+ if (dst)
+ return dst;
+ }
fl6->flowi6_iif = LOOPBACK_IFINDEX;
@@ -1356,6 +1364,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6->rt6i_flags & RTF_LOCAL)
return;
+ if (dst_metric_locked(dst, RTAX_MTU))
+ return;
+
dst_confirm(dst);
mtu = max_t(u32, mtu, IPV6_MIN_MTU);
if (mtu >= dst_mtu(dst))
@@ -1397,7 +1408,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
- int oif, u32 mark)
+ int oif, u32 mark, kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1409,6 +1420,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -1422,7 +1434,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
struct dst_entry *dst;
ip6_update_pmtu(skb, sock_net(sk), mtu,
- sk->sk_bound_dev_if, sk->sk_mark);
+ sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
@@ -1452,7 +1464,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_node *fn;
/* Get the "current" route for this destination and
- * check if the redirect has come from approriate router.
+ * check if the redirect has come from appropriate router.
*
* RFC 4861 specifies that redirects should only be
* accepted if they come from the nexthop to the target.
@@ -1514,7 +1526,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
flags, __ip6_route_redirect);
}
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1527,6 +1540,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1548,6 +1562,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
fl6.flowi6_mark = mark;
fl6.daddr = msg->dest;
fl6.saddr = iph->daddr;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
dst = ip6_route_redirect(net, &fl6, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1556,7 +1571,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
+ sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
@@ -1604,7 +1620,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
rcu_read_unlock();
out:
- return min_t(unsigned int, mtu, IP6_MAX_MTU);
+ mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
static struct dst_entry *icmp6_dst_gc_list;
@@ -1783,7 +1801,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
};
struct fib6_table *table;
struct rt6_info *rt;
- int flags = RT6_LOOKUP_F_IFACE;
+ int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
table = fib6_get_table(net, cfg->fc_table);
if (!table)
@@ -1982,8 +2000,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
It is very good, but in some (rare!) circumstances
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
+ We allow IPv4-mapped nexthops to support RFC4798-type
+ addressing
*/
- if (!(gwa_type & IPV6_ADDR_UNICAST))
+ if (!(gwa_type & (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_MAPPED)))
goto out;
if (cfg->fc_table) {
@@ -2153,6 +2174,8 @@ static int ip6_route_del(struct fib6_config *cfg)
continue;
if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
continue;
+ if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+ continue;
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
@@ -2319,13 +2342,16 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex)
+ const struct in6_addr *gwaddr,
+ struct net_device *dev)
{
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
+ int ifindex = dev->ifindex;
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
- table = fib6_get_table(net, RT6_TABLE_INFO);
+ table = fib6_get_table(net, tb_id);
if (!table)
return NULL;
@@ -2351,12 +2377,13 @@ out:
static struct rt6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev,
unsigned int pref)
{
struct fib6_config cfg = {
.fc_metric = IP6_RT_PRIO_USER,
- .fc_ifindex = ifindex,
+ .fc_ifindex = dev->ifindex,
.fc_dst_len = prefixlen,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
RTF_UP | RTF_PREF(pref),
@@ -2365,7 +2392,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
- cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
+ cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -2375,16 +2402,17 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
ip6_route_add(&cfg);
- return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
+ return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
struct rt6_info *rt;
struct fib6_table *table;
- table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
+ table = fib6_get_table(dev_net(dev), tb_id);
if (!table)
return NULL;
@@ -2418,20 +2446,20 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
cfg.fc_gateway = *gwaddr;
- ip6_route_add(&cfg);
+ if (!ip6_route_add(&cfg)) {
+ struct fib6_table *table;
+
+ table = fib6_get_table(dev_net(dev), cfg.fc_table);
+ if (table)
+ table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
+ }
return rt6_get_dflt_router(gwaddr, dev);
}
-void rt6_purge_dflt_routers(struct net *net)
+static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
struct rt6_info *rt;
- struct fib6_table *table;
-
- /* NOTE: Keep consistent with rt6_get_dflt_router */
- table = fib6_get_table(net, RT6_TABLE_DFLT);
- if (!table)
- return;
restart:
read_lock_bh(&table->tb6_lock);
@@ -2445,6 +2473,27 @@ restart:
}
}
read_unlock_bh(&table->tb6_lock);
+
+ table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
+}
+
+void rt6_purge_dflt_routers(struct net *net)
+{
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
+
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
+ __rt6_purge_dflt_routers(table);
+ }
+ }
+
+ rcu_read_unlock();
}
static void rtmsg_to_fib6_config(struct net *net,
@@ -2565,8 +2614,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
{
u32 tb_id;
struct net *net = dev_net(idev->dev);
- struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
- DST_NOCOUNT);
+ struct net_device *dev = net->loopback_dev;
+ struct rt6_info *rt;
+
+ /* use L3 Master device as loopback for host routes if device
+ * is enslaved and address is not link local or multicast
+ */
+ if (!rt6_need_strict(addr))
+ dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
+
+ rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
if (!rt)
return ERR_PTR(-ENOMEM);
@@ -2711,9 +2768,10 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
old MTU is the lowest MTU in the path, update the route PMTU
to reflect the increase. In this case if the other nodes' MTU
also have the lowest MTU, TOO BIG MESSAGE will be lead to
- PMTU discouvery.
+ PMTU discovery.
*/
if (rt->dst.dev == arg->dev &&
+ dst_metric_raw(&rt->dst, RTAX_MTU) &&
!dst_metric_locked(&rt->dst, RTAX_MTU)) {
if (rt->rt6i_flags & RTF_CACHE) {
/* For RTF_CACHE with rt6i_pmtu == 0
@@ -2753,6 +2811,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
[RTA_EXPIRES] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2837,6 +2896,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_MULTIPATH]) {
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+
+ err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
+ cfg->fc_mp_len);
+ if (err < 0)
+ goto errout;
}
if (tb[RTA_PREF]) {
@@ -2850,9 +2914,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_ENCAP])
cfg->fc_encap = tb[RTA_ENCAP];
- if (tb[RTA_ENCAP_TYPE])
+ if (tb[RTA_ENCAP_TYPE]) {
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ if (err < 0)
+ goto errout;
+ }
+
if (tb[RTA_EXPIRES]) {
unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
@@ -3258,7 +3327,8 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
- lwtunnel_fill_encap(skb, rt->dst.lwtstate);
+ if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -3327,6 +3397,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (tb[RTA_MARK])
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
if (iif) {
struct net_device *dev;
int flags = 0;
@@ -3347,11 +3423,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
} else {
fl6.flowi6_oif = oif;
- if (netif_index_is_l3_master(net, oif)) {
- fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
- FLOWI_FLAG_SKIP_NH_OIF;
- }
-
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
new file mode 100644
index 000000000000..a855eb325b03
--- /dev/null
+++ b/net/ipv6/seg6.c
@@ -0,0 +1,497 @@
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <linux/seg6.h>
+#include <linux/seg6_genl.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
+{
+ int trailing;
+ unsigned int tlv_offset;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (((srh->hdrlen + 1) << 3) != len)
+ return false;
+
+ if (srh->segments_left != srh->first_segment)
+ return false;
+
+ tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
+
+ trailing = len - tlv_offset;
+ if (trailing < 0)
+ return false;
+
+ while (trailing) {
+ struct sr6_tlv *tlv;
+ unsigned int tlv_len;
+
+ tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
+ tlv_len = sizeof(*tlv) + tlv->len;
+
+ trailing -= tlv_len;
+ if (trailing < 0)
+ return false;
+
+ tlv_offset += tlv_len;
+ }
+
+ return true;
+}
+
+static struct genl_family seg6_genl_family;
+
+static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
+ [SEG6_ATTR_DST] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, },
+ [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, },
+ [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, },
+ [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, },
+ [SEG6_ATTR_ALGID] = { .type = NLA_U8, },
+ [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, },
+};
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct seg6_pernet_data *sdata;
+ struct seg6_hmac_info *hinfo;
+ u32 hmackeyid;
+ char *secret;
+ int err = 0;
+ u8 algid;
+ u8 slen;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_HMACKEYID] ||
+ !info->attrs[SEG6_ATTR_SECRETLEN] ||
+ !info->attrs[SEG6_ATTR_ALGID])
+ return -EINVAL;
+
+ hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]);
+ slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]);
+ algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]);
+
+ if (hmackeyid == 0)
+ return -EINVAL;
+
+ if (slen > SEG6_HMAC_SECRET_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sdata->lock);
+ hinfo = seg6_hmac_info_lookup(net, hmackeyid);
+
+ if (!slen) {
+ if (!hinfo)
+ err = -ENOENT;
+
+ err = seg6_hmac_info_del(net, hmackeyid);
+
+ goto out_unlock;
+ }
+
+ if (!info->attrs[SEG6_ATTR_SECRET]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (hinfo) {
+ err = seg6_hmac_info_del(net, hmackeyid);
+ if (err)
+ goto out_unlock;
+ }
+
+ secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]);
+
+ hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL);
+ if (!hinfo) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(hinfo->secret, secret, slen);
+ hinfo->slen = slen;
+ hinfo->alg_id = algid;
+ hinfo->hmackeyid = hmackeyid;
+
+ err = seg6_hmac_info_add(net, hmackeyid, hinfo);
+ if (err)
+ kfree(hinfo);
+
+out_unlock:
+ mutex_unlock(&sdata->lock);
+ return err;
+}
+
+#else
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *val, *t_old, *t_new;
+ struct seg6_pernet_data *sdata;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_DST])
+ return -EINVAL;
+
+ val = nla_data(info->attrs[SEG6_ATTR_DST]);
+ t_new = kmemdup(val, sizeof(*val), GFP_KERNEL);
+ if (!t_new)
+ return -ENOMEM;
+
+ mutex_lock(&sdata->lock);
+
+ t_old = sdata->tun_src;
+ rcu_assign_pointer(sdata->tun_src, t_new);
+
+ mutex_unlock(&sdata->lock);
+
+ synchronize_net();
+ kfree(t_old);
+
+ return 0;
+}
+
+static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *tun_src;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC);
+ if (!hdr)
+ goto free_msg;
+
+ rcu_read_lock();
+ tun_src = rcu_dereference(seg6_pernet(net)->tun_src);
+
+ if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src))
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+
+ genlmsg_end(msg, hdr);
+ genlmsg_reply(msg, info);
+
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo,
+ struct sk_buff *msg)
+{
+ if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) ||
+ nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) ||
+ nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) ||
+ nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id))
+ return -1;
+
+ return 0;
+}
+
+static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (__seg6_hmac_fill_info(hinfo, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct seg6_pernet_data *sdata;
+ struct rhashtable_iter *iter;
+
+ sdata = seg6_pernet(net);
+ iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&sdata->hmac_infos, iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+
+ kfree(iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct seg6_pernet_data *sdata;
+ struct seg6_hmac_info *hinfo;
+ int ret;
+
+ sdata = seg6_pernet(net);
+
+ ret = rhashtable_walk_start(iter);
+ if (ret && ret != -EAGAIN)
+ goto done;
+
+ for (;;) {
+ hinfo = rhashtable_walk_next(iter);
+
+ if (IS_ERR(hinfo)) {
+ if (PTR_ERR(hinfo) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(hinfo);
+ goto done;
+ } else if (!hinfo) {
+ break;
+ }
+
+ ret = __seg6_genl_dumphmac_element(hinfo,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb, SEG6_CMD_DUMPHMAC);
+ if (ret)
+ goto done;
+ }
+
+ ret = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return ret;
+}
+
+#else
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int __net_init seg6_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata;
+
+ sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ mutex_init(&sdata->lock);
+
+ sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL);
+ if (!sdata->tun_src) {
+ kfree(sdata);
+ return -ENOMEM;
+ }
+
+ net->ipv6.seg6_data = sdata;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_net_init(net);
+#endif
+
+ return 0;
+}
+
+static void __net_exit seg6_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_net_exit(net);
+#endif
+
+ kfree(sdata->tun_src);
+ kfree(sdata);
+}
+
+static struct pernet_operations ip6_segments_ops = {
+ .init = seg6_net_init,
+ .exit = seg6_net_exit,
+};
+
+static const struct genl_ops seg6_genl_ops[] = {
+ {
+ .cmd = SEG6_CMD_SETHMAC,
+ .doit = seg6_genl_sethmac,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_DUMPHMAC,
+ .start = seg6_genl_dumphmac_start,
+ .dumpit = seg6_genl_dumphmac,
+ .done = seg6_genl_dumphmac_done,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_SET_TUNSRC,
+ .doit = seg6_genl_set_tunsrc,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_GET_TUNSRC,
+ .doit = seg6_genl_get_tunsrc,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static struct genl_family seg6_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SEG6_GENL_NAME,
+ .version = SEG6_GENL_VERSION,
+ .maxattr = SEG6_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = seg6_genl_ops,
+ .n_ops = ARRAY_SIZE(seg6_genl_ops),
+ .module = THIS_MODULE,
+};
+
+int __init seg6_init(void)
+{
+ int err = -ENOMEM;
+
+ err = genl_register_family(&seg6_genl_family);
+ if (err)
+ goto out;
+
+ err = register_pernet_subsys(&ip6_segments_ops);
+ if (err)
+ goto out_unregister_genl;
+
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ err = seg6_iptunnel_init();
+ if (err)
+ goto out_unregister_pernet;
+#endif
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ err = seg6_hmac_init();
+ if (err)
+ goto out_unregister_iptun;
+#endif
+
+ pr_info("Segment Routing with IPv6\n");
+
+out:
+ return err;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+out_unregister_iptun:
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ seg6_iptunnel_exit();
+#endif
+#endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+out_unregister_pernet:
+ unregister_pernet_subsys(&ip6_segments_ops);
+#endif
+out_unregister_genl:
+ genl_unregister_family(&seg6_genl_family);
+ goto out;
+}
+
+void seg6_exit(void)
+{
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_exit();
+#endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ seg6_iptunnel_exit();
+#endif
+ unregister_pernet_subsys(&ip6_segments_ops);
+ genl_unregister_family(&seg6_genl_family);
+}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
new file mode 100644
index 000000000000..6ef3dfb6e811
--- /dev/null
+++ b/net/ipv6/seg6_hmac.c
@@ -0,0 +1,484 @@
+/*
+ * SR-IPv6 implementation -- HMAC functions
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+
+#include <linux/cryptohash.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <net/seg6_hmac.h>
+#include <linux/random.h>
+
+static char * __percpu *hmac_ring;
+
+static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct seg6_hmac_info *hinfo = obj;
+
+ return (hinfo->hmackeyid != *(__u32 *)arg->key);
+}
+
+static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
+{
+ kfree_rcu(hinfo, rcu);
+}
+
+static void seg6_free_hi(void *ptr, void *arg)
+{
+ struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;
+
+ if (hinfo)
+ seg6_hinfo_release(hinfo);
+}
+
+static const struct rhashtable_params rht_params = {
+ .head_offset = offsetof(struct seg6_hmac_info, node),
+ .key_offset = offsetof(struct seg6_hmac_info, hmackeyid),
+ .key_len = sizeof(u32),
+ .automatic_shrinking = true,
+ .obj_cmpfn = seg6_hmac_cmpfn,
+};
+
+static struct seg6_hmac_algo hmac_algos[] = {
+ {
+ .alg_id = SEG6_HMAC_ALGO_SHA1,
+ .name = "hmac(sha1)",
+ },
+ {
+ .alg_id = SEG6_HMAC_ALGO_SHA256,
+ .name = "hmac(sha256)",
+ },
+};
+
+static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
+{
+ struct sr6_tlv_hmac *tlv;
+
+ if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
+ return NULL;
+
+ if (!sr_has_hmac(srh))
+ return NULL;
+
+ tlv = (struct sr6_tlv_hmac *)
+ ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);
+
+ if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
+ return NULL;
+
+ return tlv;
+}
+
+static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
+{
+ struct seg6_hmac_algo *algo;
+ int i, alg_count;
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+ for (i = 0; i < alg_count; i++) {
+ algo = &hmac_algos[i];
+ if (algo->alg_id == alg_id)
+ return algo;
+ }
+
+ return NULL;
+}
+
+static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
+ u8 *output, int outlen)
+{
+ struct seg6_hmac_algo *algo;
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+ int ret, dgsize;
+
+ algo = __hmac_get_algo(hinfo->alg_id);
+ if (!algo)
+ return -ENOENT;
+
+ tfm = *this_cpu_ptr(algo->tfms);
+
+ dgsize = crypto_shash_digestsize(tfm);
+ if (dgsize > outlen) {
+ pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
+ dgsize, outlen);
+ return -ENOMEM;
+ }
+
+ ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
+ if (ret < 0) {
+ pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
+ goto failed;
+ }
+
+ shash = *this_cpu_ptr(algo->shashs);
+ shash->tfm = tfm;
+
+ ret = crypto_shash_digest(shash, text, psize, output);
+ if (ret < 0) {
+ pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
+ goto failed;
+ }
+
+ return dgsize;
+
+failed:
+ return ret;
+}
+
+int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
+ struct in6_addr *saddr, u8 *output)
+{
+ __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
+ u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
+ int plen, i, dgsize, wrsize;
+ char *ring, *off;
+
+ /* a 160-byte buffer for digest output allows to store highest known
+ * hash function (RadioGatun) with up to 1216 bits
+ */
+
+ /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
+ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
+
+ /* this limit allows for 14 segments */
+ if (plen >= SEG6_HMAC_RING_SIZE)
+ return -EMSGSIZE;
+
+ /* Let's build the HMAC text on the ring buffer. The text is composed
+ * as follows, in order:
+ *
+ * 1. Source IPv6 address (128 bits)
+ * 2. first_segment value (8 bits)
+ * 3. Flags (8 bits)
+ * 4. HMAC Key ID (32 bits)
+ * 5. All segments in the segments list (n * 128 bits)
+ */
+
+ local_bh_disable();
+ ring = *this_cpu_ptr(hmac_ring);
+ off = ring;
+
+ /* source address */
+ memcpy(off, saddr, 16);
+ off += 16;
+
+ /* first_segment value */
+ *off++ = hdr->first_segment;
+
+ /* flags */
+ *off++ = hdr->flags;
+
+ /* HMAC Key ID */
+ memcpy(off, &hmackeyid, 4);
+ off += 4;
+
+ /* all segments in the list */
+ for (i = 0; i < hdr->first_segment + 1; i++) {
+ memcpy(off, hdr->segments + i, 16);
+ off += 16;
+ }
+
+ dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
+ SEG6_HMAC_MAX_DIGESTSIZE);
+ local_bh_enable();
+
+ if (dgsize < 0)
+ return dgsize;
+
+ wrsize = SEG6_HMAC_FIELD_LEN;
+ if (wrsize > dgsize)
+ wrsize = dgsize;
+
+ memset(output, 0, SEG6_HMAC_FIELD_LEN);
+ memcpy(output, tmp_out, wrsize);
+
+ return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_compute);
+
+/* checks if an incoming SR-enabled packet's HMAC status matches
+ * the incoming policy.
+ *
+ * called with rcu_read_lock()
+ */
+bool seg6_hmac_validate_skb(struct sk_buff *skb)
+{
+ u8 hmac_output[SEG6_HMAC_FIELD_LEN];
+ struct net *net = dev_net(skb->dev);
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ struct ipv6_sr_hdr *srh;
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(skb->dev);
+
+ srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ tlv = seg6_get_tlv_hmac(srh);
+
+ /* mandatory check but no tlv */
+ if (idev->cnf.seg6_require_hmac > 0 && !tlv)
+ return false;
+
+ /* no check */
+ if (idev->cnf.seg6_require_hmac < 0)
+ return true;
+
+ /* check only if present */
+ if (idev->cnf.seg6_require_hmac == 0 && !tlv)
+ return true;
+
+ /* now, seg6_require_hmac >= 0 && tlv */
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ return false;
+
+ if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
+ return false;
+
+ if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL(seg6_hmac_validate_skb);
+
+/* called with rcu_read_lock() */
+struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+
+ return hinfo;
+}
+EXPORT_SYMBOL(seg6_hmac_info_lookup);
+
+int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ int err;
+
+ err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_add);
+
+int seg6_hmac_info_del(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+ int err = -ENOENT;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+ if (!hinfo)
+ goto out;
+
+ err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+ if (err)
+ goto out;
+
+ seg6_hinfo_release(hinfo);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_del);
+
+int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+ struct ipv6_sr_hdr *srh)
+{
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ int err = -ENOENT;
+
+ tlv = seg6_get_tlv_hmac(srh);
+ if (!tlv)
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ goto out;
+
+ memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
+ err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(seg6_push_hmac);
+
+static int seg6_hmac_init_ring(void)
+{
+ int i;
+
+ hmac_ring = alloc_percpu(char *);
+
+ if (!hmac_ring)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL);
+
+ if (!ring)
+ return -ENOMEM;
+
+ *per_cpu_ptr(hmac_ring, i) = ring;
+ }
+
+ return 0;
+}
+
+static int seg6_hmac_init_algo(void)
+{
+ struct seg6_hmac_algo *algo;
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+ int i, alg_count, cpu;
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+
+ for (i = 0; i < alg_count; i++) {
+ struct crypto_shash **p_tfm;
+ int shsize;
+
+ algo = &hmac_algos[i];
+ algo->tfms = alloc_percpu(struct crypto_shash *);
+ if (!algo->tfms)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ p_tfm = per_cpu_ptr(algo->tfms, cpu);
+ *p_tfm = tfm;
+ }
+
+ p_tfm = raw_cpu_ptr(algo->tfms);
+ tfm = *p_tfm;
+
+ shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
+
+ algo->shashs = alloc_percpu(struct shash_desc *);
+ if (!algo->shashs)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ shash = kzalloc(shsize, GFP_KERNEL);
+ if (!shash)
+ return -ENOMEM;
+ *per_cpu_ptr(algo->shashs, cpu) = shash;
+ }
+ }
+
+ return 0;
+}
+
+int __init seg6_hmac_init(void)
+{
+ int ret;
+
+ ret = seg6_hmac_init_ring();
+ if (ret < 0)
+ goto out;
+
+ ret = seg6_hmac_init_algo();
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(seg6_hmac_init);
+
+int __net_init seg6_hmac_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ rhashtable_init(&sdata->hmac_infos, &rht_params);
+
+ return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_net_init);
+
+void seg6_hmac_exit(void)
+{
+ struct seg6_hmac_algo *algo = NULL;
+ int i, alg_count, cpu;
+
+ for_each_possible_cpu(i) {
+ char *ring = *per_cpu_ptr(hmac_ring, i);
+
+ kfree(ring);
+ }
+ free_percpu(hmac_ring);
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+ for (i = 0; i < alg_count; i++) {
+ algo = &hmac_algos[i];
+ for_each_possible_cpu(cpu) {
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+
+ shash = *per_cpu_ptr(algo->shashs, cpu);
+ kfree(shash);
+ tfm = *per_cpu_ptr(algo->tfms, cpu);
+ crypto_free_shash(tfm);
+ }
+ free_percpu(algo->tfms);
+ free_percpu(algo->shashs);
+ }
+}
+EXPORT_SYMBOL(seg6_hmac_exit);
+
+void __net_exit seg6_hmac_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
+}
+EXPORT_SYMBOL(seg6_hmac_net_exit);
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
new file mode 100644
index 000000000000..c46f8cbf5ab5
--- /dev/null
+++ b/net/ipv6/seg6_iptunnel.c
@@ -0,0 +1,436 @@
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_iptunnel.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#ifdef CONFIG_DST_CACHE
+#include <net/dst_cache.h>
+#endif
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+struct seg6_lwt {
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache cache;
+#endif
+ struct seg6_iptunnel_encap tuninfo[0];
+};
+
+static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct seg6_lwt *)lwt->data;
+}
+
+static inline struct seg6_iptunnel_encap *
+seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return seg6_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
+ [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+int nla_put_srh(struct sk_buff *skb, int attrtype,
+ struct seg6_iptunnel_encap *tuninfo)
+{
+ struct seg6_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo, len);
+
+ return 0;
+}
+
+static void set_tun_src(struct net *net, struct net_device *dev,
+ struct in6_addr *daddr, struct in6_addr *saddr)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct in6_addr *tun_src;
+
+ rcu_read_lock();
+
+ tun_src = rcu_dereference(sdata->tun_src);
+
+ if (!ipv6_addr_any(tun_src)) {
+ memcpy(saddr, tun_src, sizeof(struct in6_addr));
+ } else {
+ ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
+ saddr);
+ }
+
+ rcu_read_unlock();
+}
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
+static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, tot_len, err;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+ tot_len = hdrlen + sizeof(*hdr);
+
+ err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* inherit tc, flowlabel and hlim
+ * hlim will be decremented in ip6_forward() afterwards and
+ * decapsulation will overwrite inner hlim with outer hlim
+ */
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ ip6_flowlabel(inner_hdr));
+ hdr->hop_limit = inner_hdr->hop_limit;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = NEXTHDR_IPV6;
+
+ hdr->daddr = isrh->segments[isrh->first_segment];
+ set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
+/* insert an SRH within an IPv6 packet, just after the IPv6 header */
+#ifdef CONFIG_IPV6_SEG6_INLINE
+static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ struct ipv6hdr *hdr, *oldhdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, err;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+
+ err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+
+ memmove(hdr, oldhdr, sizeof(*hdr));
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh->segments[0] = hdr->daddr;
+ hdr->daddr = isrh->segments[isrh->first_segment];
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ return 0;
+}
+#endif
+
+static int seg6_do_srh(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct seg6_iptunnel_encap *tinfo;
+ int err = 0;
+
+ tinfo = seg6_encap_lwtunnel(dst->lwtstate);
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ switch (tinfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+ case SEG6_IPTUN_MODE_INLINE:
+ err = seg6_do_srh_inline(skb, tinfo->srh);
+ skb_reset_inner_headers(skb);
+ break;
+#endif
+ case SEG6_IPTUN_MODE_ENCAP:
+ err = seg6_do_srh_encap(skb, tinfo->srh);
+ break;
+ }
+
+ if (err)
+ return err;
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_set_inner_protocol(skb, skb->protocol);
+
+ return 0;
+}
+
+int seg6_input(struct sk_buff *skb)
+{
+ int err;
+
+ err = seg6_do_srh(skb);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ ip6_route_input(skb);
+
+ return dst_input(skb);
+}
+
+int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct seg6_lwt *slwt;
+ int err = -EINVAL;
+
+ err = seg6_do_srh(skb);
+ if (unlikely(err))
+ goto drop;
+
+ slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
+
+#ifdef CONFIG_DST_CACHE
+ preempt_disable();
+ dst = dst_cache_get(&slwt->cache);
+ preempt_enable();
+#endif
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ dst_release(dst);
+ goto drop;
+ }
+
+#ifdef CONFIG_DST_CACHE
+ preempt_disable();
+ dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
+ preempt_enable();
+#endif
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ return dst_output(net, sk, skb);
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int seg6_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
+ struct seg6_iptunnel_encap *tuninfo;
+ struct lwtunnel_state *newts;
+ int tuninfo_len, min_size;
+ struct seg6_lwt *slwt;
+ int err;
+
+ err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
+ seg6_iptunnel_policy);
+
+ if (err < 0)
+ return err;
+
+ if (!tb[SEG6_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
+ tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
+
+ /* tuninfo must contain at least the iptunnel encap structure,
+ * the SRH and one segment
+ */
+ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
+ sizeof(struct in6_addr);
+ if (tuninfo_len < min_size)
+ return -EINVAL;
+
+ switch (tuninfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+ case SEG6_IPTUN_MODE_INLINE:
+ break;
+#endif
+ case SEG6_IPTUN_MODE_ENCAP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* verify that SRH is consistent */
+ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
+ if (!newts)
+ return -ENOMEM;
+
+ slwt = seg6_lwt_lwtunnel(newts);
+
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&slwt->cache, GFP_KERNEL);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+#endif
+
+ memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
+
+ newts->type = LWTUNNEL_ENCAP_SEG6;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+ LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->headroom = seg6_lwt_headroom(tuninfo);
+
+ *ts = newts;
+
+ return 0;
+}
+
+#ifdef CONFIG_DST_CACHE
+static void seg6_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
+}
+#endif
+
+static int seg6_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
+}
+
+static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
+ struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
+ int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
+
+ if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops seg6_iptun_ops = {
+ .build_state = seg6_build_state,
+#ifdef CONFIG_DST_CACHE
+ .destroy_state = seg6_destroy_state,
+#endif
+ .output = seg6_output,
+ .input = seg6_input,
+ .fill_encap = seg6_fill_encap_info,
+ .get_encap_size = seg6_encap_nlsize,
+ .cmp_encap = seg6_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init seg6_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
+
+void seg6_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 182b6a9be29d..99853c6e33a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -31,7 +31,7 @@
#include <linux/if_arp.h>
#include <linux/icmp.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -62,7 +62,7 @@
For comments look at net/ipv4/ip_gre.c --ANK
*/
-#define HASH_SIZE 16
+#define IP6_SIT_HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
static bool log_ecn_error = true;
@@ -76,11 +76,11 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
__be32 *v4dst);
static struct rtnl_link_ops sit_link_ops __read_mostly;
-static int sit_net_id __read_mostly;
+static unsigned int sit_net_id __read_mostly;
struct sit_net {
- struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE];
struct ip_tunnel __rcu *tunnels_wc[1];
struct ip_tunnel __rcu **tunnels[4];
@@ -1126,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
}
#endif
-bool ipip6_valid_ip_proto(u8 ipproto)
+static bool ipip6_valid_ip_proto(u8 ipproto)
{
return ipproto == IPPROTO_IPV6 ||
ipproto == IPPROTO_IPIP ||
@@ -1318,23 +1318,11 @@ done:
return err;
}
-static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- int t_hlen = tunnel->hlen + sizeof(struct iphdr);
-
- if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops ipip6_netdev_ops = {
.ndo_init = ipip6_tunnel_init,
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = sit_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
- .ndo_change_mtu = ipip6_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
};
@@ -1365,6 +1353,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
dev->type = ARPHRD_SIT;
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
+ dev->min_mtu = IPV6_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - t_hlen;
dev->flags = IFF_NOARP;
netif_keep_dst(dev);
dev->addr_len = 4;
@@ -1390,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (err) {
free_percpu(dev->tstats);
+ dev->tstats = NULL;
return err;
}
@@ -1783,7 +1774,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
for (prio = 1; prio < 4; prio++) {
int h;
- for (h = 0; h < HASH_SIZE; h++) {
+ for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
struct ip_tunnel *t;
t = rtnl_dereference(sitn->tunnels[prio][h]);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 59c483937aec..a4d49760bf43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -209,6 +209,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack.v64 = 0;
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
/*
* We need to lookup the dst_entry to get the correct window size.
@@ -227,6 +228,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94f4f89d73e7..4c60c6f71cd3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
}
}
-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -148,8 +148,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
* connect() to INADDR_ANY means loopback (BSD'ism).
*/
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 0x1;
+ if (ipv6_addr_any(&usin->sin6_addr)) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
+ }
addr_type = ipv6_addr_type(&usin->sin6_addr);
@@ -188,7 +193,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
* TCP over IPv4
*/
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
@@ -233,6 +238,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
@@ -282,7 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
- inet->inet_dport);
+ inet->inet_dport,
+ &tp->tsoffset);
err = tcp_connect(sk);
if (err)
@@ -397,7 +404,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
- &tp->tsq_flags))
+ &sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
@@ -467,7 +474,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -671,6 +678,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
genhash ? "failed" : "mismatch",
&ip6h->saddr, ntohs(th->source),
@@ -827,6 +835,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
@@ -836,7 +845,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -953,7 +962,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
+ req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
}
@@ -986,6 +996,16 @@ drop:
return 0; /* don't send reset */
}
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+ /* We need to move header back to the beginning if xfrm6_policy_check()
+ * and tcp_v6_fill_cb() are going to be called again.
+ * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+ */
+ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+ sizeof(struct inet6_skb_parm));
+}
+
static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
@@ -1177,8 +1197,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
sk_gfp_mask(sk, GFP_ATOMIC));
consume_skb(ireq->pktopts);
ireq->pktopts = NULL;
- if (newnp->pktoptions)
+ if (newnp->pktoptions) {
+ tcp_v6_restore_cb(newnp->pktoptions);
skb_set_owner_r(newnp->pktoptions, newsk);
+ }
}
}
@@ -1218,7 +1240,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
- if (sk_filter(sk, skb))
+ if (tcp_filter(sk, skb))
goto discard;
/*
@@ -1322,6 +1344,7 @@ ipv6_pktoptions:
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
skb_set_owner_r(opt_skb, sk);
+ tcp_v6_restore_cb(opt_skb);
opt_skb = xchg(&np->pktoptions, opt_skb);
} else {
__kfree_skb(opt_skb);
@@ -1355,15 +1378,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
TCP_SKB_CB(skb)->sacked = 0;
}
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
- /* We need to move header back to the beginning if xfrm6_policy_check()
- * and tcp_v6_fill_cb() are going to be called again.
- */
- memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
- sizeof(struct inet6_skb_parm));
-}
-
static int tcp_v6_rcv(struct sk_buff *skb)
{
const struct tcphdr *th;
@@ -1415,6 +1429,7 @@ process:
sk = req->rsk_listener;
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_v6_inbound_md5_hash(sk, skb)) {
+ sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1453,8 +1468,10 @@ process:
if (tcp_v6_inbound_md5_hash(sk, skb))
goto discard_and_relse;
- if (sk_filter(sk, skb))
+ if (tcp_filter(sk, skb))
goto discard_and_relse;
+ th = (const struct tcphdr *)skb->data;
+ hdr = ipv6_hdr(skb);
skb->dev = NULL;
@@ -1471,10 +1488,7 @@ process:
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
ret = tcp_v6_do_rcv(sk, skb);
- } else if (unlikely(sk_add_backlog(sk, skb,
- sk->sk_rcvbuf + sk->sk_sndbuf))) {
- bh_unlock_sock(sk);
- __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+ } else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
}
bh_unlock_sock(sk);
@@ -1868,17 +1882,6 @@ void tcp6_proc_exit(struct net *net)
}
#endif
-static void tcp_v6_clear_sk(struct sock *sk, int size)
-{
- struct inet_sock *inet = inet_sk(sk);
-
- /* we do not want to clear pinet6 field, because of RCU lookups */
- sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
-
- size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
- memset(&inet->pinet6 + 1, 0, size);
-}
-
struct proto tcpv6_prot = {
.name = "TCPv6",
.owner = THIS_MODULE,
@@ -1920,7 +1923,6 @@ struct proto tcpv6_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
- .clear_sk = tcp_v6_clear_sk,
.diag_destroy = tcp_abort,
};
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 19ac3a1c308d..221825a9407a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -35,7 +35,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/addrconf.h>
#include <net/ndisc.h>
@@ -302,7 +302,8 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
+ IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
@@ -334,7 +335,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
int is_udp4;
- bool slow;
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
@@ -344,8 +344,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
try_again:
peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -364,7 +363,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -378,7 +378,6 @@ try_again:
goto csum_copy_err;
}
if (unlikely(err)) {
- trace_kfree_skb(skb, udpv6_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
if (is_udp4)
@@ -388,7 +387,7 @@ try_again:
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
is_udplite);
}
- skb_free_datagram_locked(sk, skb);
+ kfree_skb(skb);
return err;
}
if (!peeked) {
@@ -427,7 +426,8 @@ try_again:
if (is_udp4) {
if (inet->cmsg_flags)
- ip_cmsg_recv(msg, skb);
+ ip_cmsg_recv_offset(msg, sk, skb,
+ sizeof(struct udphdr), off);
} else {
if (np->rxopt.all)
ip6_datagram_recv_specific_ctl(sk, msg, skb);
@@ -437,12 +437,11 @@ try_again:
if (flags & MSG_TRUNC)
err = ulen;
- __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags)) {
+ if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
if (is_udp4) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_CSUMERRORS, is_udplite);
@@ -455,7 +454,7 @@ csum_copy_err:
UDP_MIB_INERRORS, is_udplite);
}
}
- unlock_sock_fast(sk, slow);
+ kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -513,7 +512,7 @@ out:
return;
}
-static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -521,9 +520,11 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- rc = __sock_queue_rcv_skb(sk, skb);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -535,6 +536,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
return -1;
}
+
return 0;
}
@@ -556,7 +558,6 @@ EXPORT_SYMBOL(udpv6_encap_enable);
int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -622,25 +623,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
udp_csum_pull_header(skb);
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- __UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_RCVBUFERRORS, is_udplite);
- goto drop;
- }
skb_dst_drop(skb);
- bh_lock_sock(sk);
- rc = 0;
- if (!sock_owned_by_user(sk))
- rc = __udpv6_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
-
- return rc;
+ return __udpv6_queue_rcv_skb(sk, skb);
csum_error:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -705,10 +691,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
if (use_hash2) {
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
- udp_table.mask;
- hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+ udptable->mask;
+ hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udp_table.hash2[hash2];
+ hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -1047,6 +1033,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
daddr = &sin6->sin6_addr;
+ if (ipv6_addr_any(daddr) &&
+ ipv6_addr_v4mapped(&np->saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ daddr);
break;
case AF_INET:
goto do_udp_sendmsg;
@@ -1155,6 +1145,7 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
sockc.tsflags = sk->sk_tsflags;
if (msg->msg_controllen) {
@@ -1424,17 +1415,6 @@ void udp6_proc_exit(struct net *net)
}
#endif /* CONFIG_PROC_FS */
-void udp_v6_clear_sk(struct sock *sk, int size)
-{
- struct inet_sock *inet = inet_sk(sk);
-
- /* we do not want to clear pinet6 field, because of RCU lookups */
- sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6));
-
- size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
- memset(&inet->pinet6 + 1, 0, size);
-}
-
/* ------------------------------------------------------------------------ */
struct proto udpv6_prot = {
@@ -1444,12 +1424,12 @@ struct proto udpv6_prot = {
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
+ .init = udp_init_sock,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
- .backlog_rcv = __udpv6_queue_rcv_skb,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
@@ -1465,7 +1445,7 @@ struct proto udpv6_prot = {
.compat_setsockopt = compat_udpv6_setsockopt,
.compat_getsockopt = compat_udpv6_getsockopt,
#endif
- .clear_sk = udp_v6_clear_sk,
+ .diag_destroy = udp_abort,
};
static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 0682c031ccdc..e78bdc76dcc3 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,11 +26,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
-int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udpv6_destroy_sock(struct sock *sk);
-void udp_v6_clear_sk(struct sock *sk, int size);
-
#ifdef CONFIG_PROC_FS
int udp6_seq_show(struct seq_file *seq, void *v);
#endif
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index fd6ef414899b..2784cc363f2b 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,17 +45,17 @@ struct proto udplitev6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
- .backlog_rcv = udpv6_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v6_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp6_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt,
.compat_getsockopt = compat_udpv6_getsockopt,
#endif
- .clear_sk = udp_v6_clear_sk,
};
static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 70a86adad875..e0f71c01d728 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
nexthdr = nh[nhoff];
if (skb_dst(skb))
- oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e1c0bbe7996c..d7b731a78d09 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -44,7 +44,7 @@ struct xfrm6_tunnel_net {
u32 spi;
};
-static int xfrm6_tunnel_net_id __read_mostly;
+static unsigned int xfrm6_tunnel_net_id __read_mostly;
static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
{
return net_generic(net, xfrm6_tunnel_net_id);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 48d0dc89b58d..8a9219ff2e77 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -56,7 +56,7 @@
#include <net/tcp_states.h>
#include <net/net_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* Configuration Variables */
static unsigned char ipxcfg_max_hops = 16;
@@ -1809,7 +1809,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
if (rc)
goto out_free;
- if (skb->tstamp.tv64)
+ if (skb->tstamp)
sk->sk_stamp = skb->tstamp;
if (sipx) {
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ccc244406fb9..ab254041dab7 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -52,7 +52,7 @@
#include <linux/poll.h>
#include <asm/ioctls.h> /* TIOCOUTQ, TIOCINQ */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -845,9 +845,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
if (sock->state != SS_UNCONNECTED)
goto out;
- if ((sk = sock->sk) == NULL)
- goto out;
-
err = -EOPNOTSUPP;
if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
(sk->sk_type != SOCK_DGRAM))
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 873c4b707d6a..817b1b186aff 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -40,7 +40,7 @@
#include <linux/interrupt.h>
#include <linux/device.h> /* for MODULE_ALIAS_CHARDEV_MAJOR */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/irda/irda.h>
#include <net/irda/irmod.h>
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 8f5678cb6263..f18070118d05 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -32,7 +32,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/irda/irda.h>
#include <net/irda/irmod.h>
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index 856736656a30..890b90d055d5 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -43,7 +43,7 @@
#include <linux/export.h>
#include <asm/ioctls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/dma.h>
#include <asm/io.h>
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index d8b7267280c3..74d09f91709e 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -51,7 +51,6 @@ static const struct net_device_ops irlan_eth_netdev_ops = {
.ndo_stop = irlan_eth_close,
.ndo_start_xmit = irlan_eth_xmit,
.ndo_set_rx_mode = irlan_eth_set_multicast_list,
- .ndo_change_mtu = eth_change_mtu,
.ndo_validate_addr = eth_validate_addr,
};
@@ -67,7 +66,8 @@ static void irlan_eth_setup(struct net_device *dev)
dev->netdev_ops = &irlan_eth_netdev_ops;
dev->destructor = free_netdev;
-
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
/*
* Lets do all queueing in IrTTP instead of this device driver.
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 8d65bb9477fc..9d451f8ed47a 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -245,12 +245,11 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/netdevice.h>
-#include <linux/miscdevice.h>
#include <linux/poll.h>
#include <linux/capability.h>
#include <linux/ctype.h> /* isspace() */
#include <linux/string.h> /* skip_spaces() */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/ppp_defs.h>
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index 940225866da0..32061442cc8e 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -15,13 +15,10 @@
/***************************** INCLUDES *****************************/
#include "irnet.h" /* Module global include */
+#include <linux/miscdevice.h>
/************************ CONSTANTS & MACROS ************************/
-/* /dev/irnet file constants */
-#define IRNET_MAJOR 10 /* Misc range */
-#define IRNET_MINOR 187 /* Official allocation */
-
/* IrNET control channel stuff */
#define IRNET_MAX_COMMAND 256 /* Max length of a command line */
@@ -111,9 +108,9 @@ static const struct file_operations irnet_device_fops =
/* Structure so that the misc major (drivers/char/misc.c) take care of us... */
static struct miscdevice irnet_misc_device =
{
- IRNET_MINOR,
- "irnet",
- &irnet_device_fops
+ .minor = IRNET_MINOR,
+ .name = "irnet",
+ .fops = &irnet_device_fops
};
#endif /* IRNET_PPP_H */
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index e15c40e86660..7fc340e574cf 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -24,13 +24,7 @@
-static struct genl_family irda_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = IRDA_NL_NAME,
- .hdrsize = 0,
- .version = IRDA_NL_VERSION,
- .maxattr = IRDA_NL_CMD_MAX,
-};
+static struct genl_family irda_nl_family;
static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
{
@@ -147,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = {
};
-int irda_nl_register(void)
+static struct genl_family irda_nl_family __ro_after_init = {
+ .name = IRDA_NL_NAME,
+ .hdrsize = 0,
+ .version = IRDA_NL_VERSION,
+ .maxattr = IRDA_NL_CMD_MAX,
+ .module = THIS_MODULE,
+ .ops = irda_nl_ops,
+ .n_ops = ARRAY_SIZE(irda_nl_ops),
+};
+
+int __init irda_nl_register(void)
{
- return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
+ return genl_register_family(&irda_nl_family);
}
void irda_nl_unregister(void)
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
index b9ac598e2116..77cfdde9d82f 100644
--- a/net/irda/irproc.c
+++ b/net/irda/irproc.c
@@ -23,7 +23,6 @@
*
********************************************************************/
-#include <linux/miscdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
* for deallocating this structure if it's complex. If not the user can
* just supply kfree, which should take care of the job.
*/
-#ifdef CONFIG_LOCKDEP
-static int hashbin_lock_depth = 0;
-#endif
int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
{
irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
/* Synchronize */
- if ( hashbin->hb_type & HB_LOCK ) {
- spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags,
- hashbin_lock_depth++);
- }
+ if (hashbin->hb_type & HB_LOCK)
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
/*
* Free the entries in the hashbin, TODO: use hashbin_clear when
* it has been shown to work
*/
for (i = 0; i < HASHBIN_SIZE; i ++ ) {
- queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
- while (queue ) {
- if (free_func)
- (*free_func)(queue);
- queue = dequeue_first(
- (irda_queue_t**) &hashbin->hb_queue[i]);
+ while (1) {
+ queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
+
+ if (!queue)
+ break;
+
+ if (free_func) {
+ if (hashbin->hb_type & HB_LOCK)
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ free_func(queue);
+ if (hashbin->hb_type & HB_LOCK)
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ }
}
}
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
hashbin->magic = ~HB_MAGIC;
/* Release lock */
- if ( hashbin->hb_type & HB_LOCK) {
+ if (hashbin->hb_type & HB_LOCK)
spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
-#ifdef CONFIG_LOCKDEP
- hashbin_lock_depth--;
-#endif
- }
/*
* Free the hashbin structure
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 02b45a8e8b35..13190b38f22e 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -453,19 +453,27 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
}
}
-/* Send FIN through an IUCV socket for HIPER transport */
+/* Send controlling flags through an IUCV socket for HIPER transport */
static int iucv_send_ctrl(struct sock *sk, u8 flags)
{
int err = 0;
int blen;
struct sk_buff *skb;
+ u8 shutdown = 0;
blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ /* controlling flags should be sent anyway */
+ shutdown = sk->sk_shutdown;
+ sk->sk_shutdown &= RCV_SHUTDOWN;
+ }
skb = sock_alloc_send_skb(sk, blen, 1, &err);
if (skb) {
skb_reserve(skb, blen);
err = afiucv_hs_send(NULL, sk, skb, flags);
}
+ if (shutdown)
+ sk->sk_shutdown = shutdown;
return err;
}
@@ -1036,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
- size_t headroom, linear;
+ size_t headroom = 0;
+ size_t linear;
struct sk_buff *skb;
struct iucv_message txmsg = {0};
struct cmsghdr *cmsg;
@@ -1114,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
* this is fine for SOCK_SEQPACKET (unless we want to support
* segmented records using the MSG_EOR flag), but
* for SOCK_STREAM we might want to improve it in future */
- headroom = (iucv->transport == AF_IUCV_TRANS_HIPER)
- ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0;
- if (headroom + len < PAGE_SIZE) {
+ if (iucv->transport == AF_IUCV_TRANS_HIPER) {
+ headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
linear = len;
} else {
- /* In nonlinear "classic" iucv skb,
- * reserve space for iucv_array
- */
- if (iucv->transport != AF_IUCV_TRANS_HIPER)
- headroom += sizeof(struct iucv_array) *
- (MAX_SKB_FRAGS + 1);
- linear = PAGE_SIZE - headroom;
+ if (len < PAGE_SIZE) {
+ linear = len;
+ } else {
+ /* In nonlinear "classic" iucv skb,
+ * reserve space for iucv_array
+ */
+ headroom = sizeof(struct iucv_array) *
+ (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
}
skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear,
noblock, &err, 0);
@@ -1315,8 +1326,13 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
}
IUCV_SKB_CB(skb)->offset = 0;
- if (sock_queue_rcv_skb(sk, skb))
- skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb);
+ if (sk_filter(sk, skb)) {
+ atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ kfree_skb(skb);
+ return;
+ }
+ if (__sock_queue_rcv_skb(sk, skb)) /* handle rcv queue full */
+ skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
}
/* iucv_process_message_q() - Process outstanding IUCV messages
@@ -1430,13 +1446,13 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
rskb = skb_dequeue(&iucv->backlog_skb_q);
while (rskb) {
IUCV_SKB_CB(rskb)->offset = 0;
- if (sock_queue_rcv_skb(sk, rskb)) {
+ if (__sock_queue_rcv_skb(sk, rskb)) {
+ /* handle rcv queue full */
skb_queue_head(&iucv->backlog_skb_q,
rskb);
break;
- } else {
- rskb = skb_dequeue(&iucv->backlog_skb_q);
}
+ rskb = skb_dequeue(&iucv->backlog_skb_q);
}
if (skb_queue_empty(&iucv->backlog_skb_q)) {
if (!list_empty(&iucv->message_q.list))
@@ -2116,12 +2132,17 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
skb_reset_transport_header(skb);
skb_reset_network_header(skb);
IUCV_SKB_CB(skb)->offset = 0;
+ if (sk_filter(sk, skb)) {
+ atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
spin_lock(&iucv->message_q.lock);
if (skb_queue_empty(&iucv->backlog_skb_q)) {
- if (sock_queue_rcv_skb(sk, skb)) {
+ if (__sock_queue_rcv_skb(sk, skb))
/* handle rcv queue full */
skb_queue_tail(&iucv->backlog_skb_q, skb);
- }
} else
skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
spin_unlock(&iucv->message_q.lock);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 88a2a3ba4212..8f7ef167c45a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -639,7 +639,7 @@ static void iucv_disable(void)
put_online_cpus();
}
-static void free_iucv_data(int cpu)
+static int iucv_cpu_dead(unsigned int cpu)
{
kfree(iucv_param_irq[cpu]);
iucv_param_irq[cpu] = NULL;
@@ -647,9 +647,10 @@ static void free_iucv_data(int cpu)
iucv_param[cpu] = NULL;
kfree(iucv_irq_data[cpu]);
iucv_irq_data[cpu] = NULL;
+ return 0;
}
-static int alloc_iucv_data(int cpu)
+static int iucv_cpu_prepare(unsigned int cpu)
{
/* Note: GFP_DMA used to get memory below 2G */
iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
@@ -671,58 +672,38 @@ static int alloc_iucv_data(int cpu)
return 0;
out_free:
- free_iucv_data(cpu);
+ iucv_cpu_dead(cpu);
return -ENOMEM;
}
-static int iucv_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int iucv_cpu_online(unsigned int cpu)
{
- cpumask_t cpumask;
- long cpu = (long) hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (alloc_iucv_data(cpu))
- return notifier_from_errno(-ENOMEM);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_iucv_data(cpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!iucv_path_table)
- break;
- smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (!iucv_path_table)
- break;
- cpumask_copy(&cpumask, &iucv_buffer_cpumask);
- cpumask_clear_cpu(cpu, &cpumask);
- if (cpumask_empty(&cpumask))
- /* Can't offline last IUCV enabled cpu. */
- return notifier_from_errno(-EINVAL);
- smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
- if (cpumask_empty(&iucv_irq_cpumask))
- smp_call_function_single(
- cpumask_first(&iucv_buffer_cpumask),
- iucv_allow_cpu, NULL, 1);
- break;
- }
- return NOTIFY_OK;
+ if (!iucv_path_table)
+ return 0;
+ iucv_declare_cpu(NULL);
+ return 0;
}
-static struct notifier_block __refdata iucv_cpu_notifier = {
- .notifier_call = iucv_cpu_notify,
-};
+static int iucv_cpu_down_prep(unsigned int cpu)
+{
+ cpumask_t cpumask;
+
+ if (!iucv_path_table)
+ return 0;
+
+ cpumask_copy(&cpumask, &iucv_buffer_cpumask);
+ cpumask_clear_cpu(cpu, &cpumask);
+ if (cpumask_empty(&cpumask))
+ /* Can't offline last IUCV enabled cpu. */
+ return -EINVAL;
+
+ iucv_retrieve_cpu(NULL);
+ if (!cpumask_empty(&iucv_irq_cpumask))
+ return 0;
+ smp_call_function_single(cpumask_first(&iucv_buffer_cpumask),
+ iucv_allow_cpu, NULL, 1);
+ return 0;
+}
/**
* iucv_sever_pathid
@@ -2027,6 +2008,7 @@ struct iucv_interface iucv_if = {
};
EXPORT_SYMBOL(iucv_if);
+static enum cpuhp_state iucv_online;
/**
* iucv_init
*
@@ -2035,7 +2017,6 @@ EXPORT_SYMBOL(iucv_if);
static int __init iucv_init(void)
{
int rc;
- int cpu;
if (!MACHINE_IS_VM) {
rc = -EPROTONOSUPPORT;
@@ -2054,23 +2035,19 @@ static int __init iucv_init(void)
goto out_int;
}
- cpu_notifier_register_begin();
-
- for_each_online_cpu(cpu) {
- if (alloc_iucv_data(cpu)) {
- rc = -ENOMEM;
- goto out_free;
- }
- }
- rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
+ rc = cpuhp_setup_state(CPUHP_NET_IUCV_PREPARE, "net/iucv:prepare",
+ iucv_cpu_prepare, iucv_cpu_dead);
if (rc)
- goto out_free;
-
- cpu_notifier_register_done();
+ goto out_dev;
+ rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "net/iucv:online",
+ iucv_cpu_online, iucv_cpu_down_prep);
+ if (rc < 0)
+ goto out_prep;
+ iucv_online = rc;
rc = register_reboot_notifier(&iucv_reboot_notifier);
if (rc)
- goto out_cpu;
+ goto out_remove_hp;
ASCEBC(iucv_error_no_listener, 16);
ASCEBC(iucv_error_no_memory, 16);
ASCEBC(iucv_error_pathid, 16);
@@ -2084,15 +2061,11 @@ static int __init iucv_init(void)
out_reboot:
unregister_reboot_notifier(&iucv_reboot_notifier);
-out_cpu:
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&iucv_cpu_notifier);
-out_free:
- for_each_possible_cpu(cpu)
- free_iucv_data(cpu);
-
- cpu_notifier_register_done();
-
+out_remove_hp:
+ cpuhp_remove_state(iucv_online);
+out_prep:
+ cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
+out_dev:
root_device_unregister(iucv_root);
out_int:
unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
@@ -2110,7 +2083,6 @@ out:
static void __exit iucv_exit(void)
{
struct iucv_irq_list *p, *n;
- int cpu;
spin_lock_irq(&iucv_queue_lock);
list_for_each_entry_safe(p, n, &iucv_task_queue, list)
@@ -2119,11 +2091,9 @@ static void __exit iucv_exit(void)
kfree(p);
spin_unlock_irq(&iucv_queue_lock);
unregister_reboot_notifier(&iucv_reboot_notifier);
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&iucv_cpu_notifier);
- for_each_possible_cpu(cpu)
- free_iucv_data(cpu);
- cpu_notifier_register_done();
+
+ cpuhp_remove_state_nocalls(iucv_online);
+ cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
root_device_unregister(iucv_root);
bus_unregister(&iucv_bus);
unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 5db94d940ecc..87fca36e6c47 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -3,6 +3,7 @@ config AF_KCM
tristate "KCM sockets"
depends on INET
select BPF_SYSCALL
+ select STREAM_PARSER
---help---
KCM (Kernel Connection Multiplexor) sockets provide a method
for multiplexing messages of a message based application
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 16c2e03bd388..bf75c9231cca 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
seq_printf(seq,
" psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
psock->index,
- psock->stats.rx_msgs,
- psock->stats.rx_bytes,
+ psock->strp.stats.rx_msgs,
+ psock->strp.stats.rx_bytes,
psock->stats.tx_msgs,
psock->stats.tx_bytes,
psock->sk->sk_receive_queue.qlen,
@@ -170,14 +170,27 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
if (psock->tx_stopped)
seq_puts(seq, "TxStop ");
- if (psock->rx_stopped)
+ if (psock->strp.rx_stopped)
seq_puts(seq, "RxStop ");
if (psock->tx_kcm)
seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
- if (psock->ready_rx_msg)
- seq_puts(seq, "RdyRx ");
+ if (!psock->strp.rx_paused && !psock->ready_rx_msg) {
+ if (psock->sk->sk_receive_queue.qlen) {
+ if (psock->strp.rx_need_bytes)
+ seq_printf(seq, "RxWait=%u ",
+ psock->strp.rx_need_bytes);
+ else
+ seq_printf(seq, "RxWait ");
+ }
+ } else {
+ if (psock->strp.rx_paused)
+ seq_puts(seq, "RxPause ");
+
+ if (psock->ready_rx_msg)
+ seq_puts(seq, "RdyRx ");
+ }
seq_puts(seq, "\n");
}
@@ -275,6 +288,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
{
struct kcm_psock_stats psock_stats;
struct kcm_mux_stats mux_stats;
+ struct strp_aggr_stats strp_stats;
struct kcm_mux *mux;
struct kcm_psock *psock;
struct net *net = seq->private;
@@ -282,20 +296,28 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
memset(&mux_stats, 0, sizeof(mux_stats));
memset(&psock_stats, 0, sizeof(psock_stats));
+ memset(&strp_stats, 0, sizeof(strp_stats));
mutex_lock(&knet->mutex);
aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
aggregate_psock_stats(&knet->aggregate_psock_stats,
&psock_stats);
+ aggregate_strp_stats(&knet->aggregate_strp_stats,
+ &strp_stats);
list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
spin_lock_bh(&mux->lock);
aggregate_mux_stats(&mux->stats, &mux_stats);
aggregate_psock_stats(&mux->aggregate_psock_stats,
&psock_stats);
- list_for_each_entry(psock, &mux->psocks, psock_list)
+ aggregate_strp_stats(&mux->aggregate_strp_stats,
+ &strp_stats);
+ list_for_each_entry(psock, &mux->psocks, psock_list) {
aggregate_psock_stats(&psock->stats, &psock_stats);
+ save_strp_stats(&psock->strp, &strp_stats);
+ }
+
spin_unlock_bh(&mux->lock);
}
@@ -328,7 +350,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
mux_stats.rx_ready_drops);
seq_printf(seq,
- "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+ "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
"Psock",
"RX-Msgs",
"RX-Bytes",
@@ -337,6 +359,8 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
"Reserved",
"Unreserved",
"RX-Aborts",
+ "RX-Intr",
+ "RX-Unrecov",
"RX-MemFail",
"RX-NeedMor",
"RX-BadLen",
@@ -345,20 +369,22 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
"TX-Aborts");
seq_printf(seq,
- "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
+ "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
"",
- psock_stats.rx_msgs,
- psock_stats.rx_bytes,
+ strp_stats.rx_msgs,
+ strp_stats.rx_bytes,
psock_stats.tx_msgs,
psock_stats.tx_bytes,
psock_stats.reserved,
psock_stats.unreserved,
- psock_stats.rx_aborts,
- psock_stats.rx_mem_fail,
- psock_stats.rx_need_more_hdr,
- psock_stats.rx_bad_hdr_len,
- psock_stats.rx_msg_too_big,
- psock_stats.rx_msg_timeouts,
+ strp_stats.rx_aborts,
+ strp_stats.rx_interrupted,
+ strp_stats.rx_unrecov_intr,
+ strp_stats.rx_mem_fail,
+ strp_stats.rx_need_more_hdr,
+ strp_stats.rx_bad_hdr_len,
+ strp_stats.rx_msg_too_big,
+ strp_stats.rx_msg_timeouts,
psock_stats.tx_aborts);
return 0;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 411693288648..a646f3481240 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1,3 +1,13 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -17,7 +27,6 @@
#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
-#include <net/tcp.h>
#include <uapi/linux/kcm.h>
unsigned int kcm_net_id;
@@ -36,38 +45,12 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
return (struct kcm_tx_msg *)skb->cb;
}
-static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
-{
- return (struct kcm_rx_msg *)((void *)skb->cb +
- offsetof(struct qdisc_skb_cb, data));
-}
-
static void report_csk_error(struct sock *csk, int err)
{
csk->sk_err = EPIPE;
csk->sk_error_report(csk);
}
-/* Callback lock held */
-static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
- struct sk_buff *skb)
-{
- struct sock *csk = psock->sk;
-
- /* Unrecoverable error in receive */
-
- del_timer(&psock->rx_msg_timer);
-
- if (psock->rx_stopped)
- return;
-
- psock->rx_stopped = 1;
- KCM_STATS_INCR(psock->stats.rx_aborts);
-
- /* Report an error on the lower socket */
- report_csk_error(csk, err);
-}
-
static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
bool wakeup_kcm)
{
@@ -110,12 +93,13 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
struct kcm_psock *psock)
{
- KCM_STATS_ADD(mux->stats.rx_bytes,
- psock->stats.rx_bytes - psock->saved_rx_bytes);
+ STRP_STATS_ADD(mux->stats.rx_bytes,
+ psock->strp.stats.rx_bytes -
+ psock->saved_rx_bytes);
mux->stats.rx_msgs +=
- psock->stats.rx_msgs - psock->saved_rx_msgs;
- psock->saved_rx_msgs = psock->stats.rx_msgs;
- psock->saved_rx_bytes = psock->stats.rx_bytes;
+ psock->strp.stats.rx_msgs - psock->saved_rx_msgs;
+ psock->saved_rx_msgs = psock->strp.stats.rx_msgs;
+ psock->saved_rx_bytes = psock->strp.stats.rx_bytes;
}
static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
@@ -168,11 +152,11 @@ static void kcm_rcv_ready(struct kcm_sock *kcm)
*/
list_del(&psock->psock_ready_list);
psock->ready_rx_msg = NULL;
-
/* Commit clearing of ready_rx_msg for queuing work */
smp_mb();
- queue_work(kcm_wq, &psock->rx_work);
+ strp_unpause(&psock->strp);
+ strp_check_rcv(&psock->strp);
}
/* Buffer limit is okay now, add to ready list */
@@ -286,6 +270,7 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
if (list_empty(&mux->kcm_rx_waiters)) {
psock->ready_rx_msg = head;
+ strp_pause(&psock->strp);
list_add_tail(&psock->psock_ready_list,
&mux->psocks_ready);
spin_unlock_bh(&mux->rx_lock);
@@ -354,346 +339,60 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
spin_unlock_bh(&mux->rx_lock);
}
-static void kcm_start_rx_timer(struct kcm_psock *psock)
-{
- if (psock->sk->sk_rcvtimeo)
- mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
-}
-
-/* Macro to invoke filter function. */
-#define KCM_RUN_FILTER(prog, ctx) \
- (*prog->bpf_func)(ctx, prog->insnsi)
-
-/* Lower socket lock held */
-static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
- unsigned int orig_offset, size_t orig_len)
-{
- struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
- struct kcm_rx_msg *rxm;
- struct kcm_sock *kcm;
- struct sk_buff *head, *skb;
- size_t eaten = 0, cand_len;
- ssize_t extra;
- int err;
- bool cloned_orig = false;
-
- if (psock->ready_rx_msg)
- return 0;
-
- head = psock->rx_skb_head;
- if (head) {
- /* Message already in progress */
-
- rxm = kcm_rx_msg(head);
- if (unlikely(rxm->early_eaten)) {
- /* Already some number of bytes on the receive sock
- * data saved in rx_skb_head, just indicate they
- * are consumed.
- */
- eaten = orig_len <= rxm->early_eaten ?
- orig_len : rxm->early_eaten;
- rxm->early_eaten -= eaten;
-
- return eaten;
- }
-
- if (unlikely(orig_offset)) {
- /* Getting data with a non-zero offset when a message is
- * in progress is not expected. If it does happen, we
- * need to clone and pull since we can't deal with
- * offsets in the skbs for a message expect in the head.
- */
- orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!orig_skb) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- desc->error = -ENOMEM;
- return 0;
- }
- if (!pskb_pull(orig_skb, orig_offset)) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- kfree_skb(orig_skb);
- desc->error = -ENOMEM;
- return 0;
- }
- cloned_orig = true;
- orig_offset = 0;
- }
-
- if (!psock->rx_skb_nextp) {
- /* We are going to append to the frags_list of head.
- * Need to unshare the frag_list.
- */
- err = skb_unclone(head, GFP_ATOMIC);
- if (err) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- desc->error = err;
- return 0;
- }
-
- if (unlikely(skb_shinfo(head)->frag_list)) {
- /* We can't append to an sk_buff that already
- * has a frag_list. We create a new head, point
- * the frag_list of that to the old head, and
- * then are able to use the old head->next for
- * appending to the message.
- */
- if (WARN_ON(head->next)) {
- desc->error = -EINVAL;
- return 0;
- }
-
- skb = alloc_skb(0, GFP_ATOMIC);
- if (!skb) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- desc->error = -ENOMEM;
- return 0;
- }
- skb->len = head->len;
- skb->data_len = head->len;
- skb->truesize = head->truesize;
- *kcm_rx_msg(skb) = *kcm_rx_msg(head);
- psock->rx_skb_nextp = &head->next;
- skb_shinfo(skb)->frag_list = head;
- psock->rx_skb_head = skb;
- head = skb;
- } else {
- psock->rx_skb_nextp =
- &skb_shinfo(head)->frag_list;
- }
- }
- }
-
- while (eaten < orig_len) {
- /* Always clone since we will consume something */
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- desc->error = -ENOMEM;
- break;
- }
-
- cand_len = orig_len - eaten;
-
- head = psock->rx_skb_head;
- if (!head) {
- head = skb;
- psock->rx_skb_head = head;
- /* Will set rx_skb_nextp on next packet if needed */
- psock->rx_skb_nextp = NULL;
- rxm = kcm_rx_msg(head);
- memset(rxm, 0, sizeof(*rxm));
- rxm->offset = orig_offset + eaten;
- } else {
- /* Unclone since we may be appending to an skb that we
- * already share a frag_list with.
- */
- err = skb_unclone(skb, GFP_ATOMIC);
- if (err) {
- KCM_STATS_INCR(psock->stats.rx_mem_fail);
- desc->error = err;
- break;
- }
-
- rxm = kcm_rx_msg(head);
- *psock->rx_skb_nextp = skb;
- psock->rx_skb_nextp = &skb->next;
- head->data_len += skb->len;
- head->len += skb->len;
- head->truesize += skb->truesize;
- }
-
- if (!rxm->full_len) {
- ssize_t len;
-
- len = KCM_RUN_FILTER(psock->bpf_prog, head);
-
- if (!len) {
- /* Need more header to determine length */
- if (!rxm->accum_len) {
- /* Start RX timer for new message */
- kcm_start_rx_timer(psock);
- }
- rxm->accum_len += cand_len;
- eaten += cand_len;
- KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
- WARN_ON(eaten != orig_len);
- break;
- } else if (len > psock->sk->sk_rcvbuf) {
- /* Message length exceeds maximum allowed */
- KCM_STATS_INCR(psock->stats.rx_msg_too_big);
- desc->error = -EMSGSIZE;
- psock->rx_skb_head = NULL;
- kcm_abort_rx_psock(psock, EMSGSIZE, head);
- break;
- } else if (len <= (ssize_t)head->len -
- skb->len - rxm->offset) {
- /* Length must be into new skb (and also
- * greater than zero)
- */
- KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
- desc->error = -EPROTO;
- psock->rx_skb_head = NULL;
- kcm_abort_rx_psock(psock, EPROTO, head);
- break;
- }
-
- rxm->full_len = len;
- }
-
- extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
-
- if (extra < 0) {
- /* Message not complete yet. */
- if (rxm->full_len - rxm->accum_len >
- tcp_inq(psock->sk)) {
- /* Don't have the whole messages in the socket
- * buffer. Set psock->rx_need_bytes to wait for
- * the rest of the message. Also, set "early
- * eaten" since we've already buffered the skb
- * but don't consume yet per tcp_read_sock.
- */
-
- if (!rxm->accum_len) {
- /* Start RX timer for new message */
- kcm_start_rx_timer(psock);
- }
-
- psock->rx_need_bytes = rxm->full_len -
- rxm->accum_len;
- rxm->accum_len += cand_len;
- rxm->early_eaten = cand_len;
- KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
- desc->count = 0; /* Stop reading socket */
- break;
- }
- rxm->accum_len += cand_len;
- eaten += cand_len;
- WARN_ON(eaten != orig_len);
- break;
- }
-
- /* Positive extra indicates ore bytes than needed for the
- * message
- */
-
- WARN_ON(extra > cand_len);
-
- eaten += (cand_len - extra);
-
- /* Hurray, we have a new message! */
- del_timer(&psock->rx_msg_timer);
- psock->rx_skb_head = NULL;
- KCM_STATS_INCR(psock->stats.rx_msgs);
-
-try_queue:
- kcm = reserve_rx_kcm(psock, head);
- if (!kcm) {
- /* Unable to reserve a KCM, message is held in psock. */
- break;
- }
-
- if (kcm_queue_rcv_skb(&kcm->sk, head)) {
- /* Should mean socket buffer full */
- unreserve_rx_kcm(psock, false);
- goto try_queue;
- }
- }
-
- if (cloned_orig)
- kfree_skb(orig_skb);
-
- KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
-
- return eaten;
-}
-
-/* Called with lock held on lower socket */
-static int psock_tcp_read_sock(struct kcm_psock *psock)
-{
- read_descriptor_t desc;
-
- desc.arg.data = psock;
- desc.error = 0;
- desc.count = 1; /* give more than one skb per call */
-
- /* sk should be locked here, so okay to do tcp_read_sock */
- tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
-
- unreserve_rx_kcm(psock, true);
-
- return desc.error;
-}
-
/* Lower sock lock held */
-static void psock_tcp_data_ready(struct sock *sk)
+static void psock_data_ready(struct sock *sk)
{
struct kcm_psock *psock;
read_lock_bh(&sk->sk_callback_lock);
psock = (struct kcm_psock *)sk->sk_user_data;
- if (unlikely(!psock || psock->rx_stopped))
- goto out;
-
- if (psock->ready_rx_msg)
- goto out;
-
- if (psock->rx_need_bytes) {
- if (tcp_inq(sk) >= psock->rx_need_bytes)
- psock->rx_need_bytes = 0;
- else
- goto out;
- }
+ if (likely(psock))
+ strp_data_ready(&psock->strp);
- if (psock_tcp_read_sock(psock) == -ENOMEM)
- queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
-
-out:
read_unlock_bh(&sk->sk_callback_lock);
}
-static void do_psock_rx_work(struct kcm_psock *psock)
+/* Called with lower sock held */
+static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
{
- read_descriptor_t rd_desc;
- struct sock *csk = psock->sk;
-
- /* We need the read lock to synchronize with psock_tcp_data_ready. We
- * need the socket lock for calling tcp_read_sock.
- */
- lock_sock(csk);
- read_lock_bh(&csk->sk_callback_lock);
-
- if (unlikely(csk->sk_user_data != psock))
- goto out;
-
- if (unlikely(psock->rx_stopped))
- goto out;
-
- if (psock->ready_rx_msg)
- goto out;
-
- rd_desc.arg.data = psock;
+ struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
+ struct kcm_sock *kcm;
- if (psock_tcp_read_sock(psock) == -ENOMEM)
- queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+try_queue:
+ kcm = reserve_rx_kcm(psock, skb);
+ if (!kcm) {
+ /* Unable to reserve a KCM, message is held in psock and strp
+ * is paused.
+ */
+ return;
+ }
-out:
- read_unlock_bh(&csk->sk_callback_lock);
- release_sock(csk);
+ if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+ /* Should mean socket buffer full */
+ unreserve_rx_kcm(psock, false);
+ goto try_queue;
+ }
}
-static void psock_rx_work(struct work_struct *w)
+static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
{
- do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
+ struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
+ struct bpf_prog *prog = psock->bpf_prog;
+
+ return (*prog->bpf_func)(skb, prog->insnsi);
}
-static void psock_rx_delayed_work(struct work_struct *w)
+static int kcm_read_sock_done(struct strparser *strp, int err)
{
- do_psock_rx_work(container_of(w, struct kcm_psock,
- rx_delayed_work.work));
+ struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
+
+ unreserve_rx_kcm(psock, true);
+
+ return err;
}
-static void psock_tcp_state_change(struct sock *sk)
+static void psock_state_change(struct sock *sk)
{
/* TCP only does a POLLIN for a half close. Do a POLLHUP here
* since application will normally not poll with POLLIN
@@ -703,7 +402,7 @@ static void psock_tcp_state_change(struct sock *sk)
report_csk_error(sk, EPIPE);
}
-static void psock_tcp_write_space(struct sock *sk)
+static void psock_write_space(struct sock *sk)
{
struct kcm_psock *psock;
struct kcm_mux *mux;
@@ -714,14 +413,13 @@ static void psock_tcp_write_space(struct sock *sk)
psock = (struct kcm_psock *)sk->sk_user_data;
if (unlikely(!psock))
goto out;
-
mux = psock->mux;
spin_lock_bh(&mux->lock);
/* Check if the socket is reserved so someone is waiting for sending. */
kcm = psock->tx_kcm;
- if (kcm)
+ if (kcm && !unlikely(kcm->tx_stopped))
queue_work(kcm_wq, &kcm->tx_work);
spin_unlock_bh(&mux->lock);
@@ -1231,23 +929,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
goto out_error;
}
- /* New message, alloc head skb */
- head = alloc_skb(0, sk->sk_allocation);
- while (!head) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
-
+ if (msg_data_left(msg)) {
+ /* New message, alloc head skb */
head = alloc_skb(0, sk->sk_allocation);
- }
+ while (!head) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
- skb = head;
+ head = alloc_skb(0, sk->sk_allocation);
+ }
- /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
- * csum_and_copy_from_iter from skb_do_copy_data_nocache.
- */
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb = head;
+
+ /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+ * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+ */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
start:
while (msg_data_left(msg)) {
@@ -1320,10 +1020,12 @@ wait_for_memory:
if (eor) {
bool not_busy = skb_queue_empty(&sk->sk_write_queue);
- /* Message complete, queue it on send buffer */
- __skb_queue_tail(&sk->sk_write_queue, head);
- kcm->seq_skb = NULL;
- KCM_STATS_INCR(kcm->stats.tx_msgs);
+ if (head) {
+ /* Message complete, queue it on send buffer */
+ __skb_queue_tail(&sk->sk_write_queue, head);
+ kcm->seq_skb = NULL;
+ KCM_STATS_INCR(kcm->stats.tx_msgs);
+ }
if (msg->msg_flags & MSG_BATCH) {
kcm->tx_wait_more = true;
@@ -1342,8 +1044,10 @@ wait_for_memory:
} else {
/* Message not complete, save state */
partial_message:
- kcm->seq_skb = head;
- kcm_tx_msg(head)->last_skb = skb;
+ if (head) {
+ kcm->seq_skb = head;
+ kcm_tx_msg(head)->last_skb = skb;
+ }
}
KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
@@ -1412,7 +1116,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
struct kcm_sock *kcm = kcm_sk(sk);
int err = 0;
long timeo;
- struct kcm_rx_msg *rxm;
+ struct strp_rx_msg *rxm;
int copied = 0;
struct sk_buff *skb;
@@ -1426,7 +1130,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
/* Okay, have a message on the receive queue */
- rxm = kcm_rx_msg(skb);
+ rxm = strp_rx_msg(skb);
if (len > rxm->full_len)
len = rxm->full_len;
@@ -1462,19 +1166,6 @@ out:
return copied ? : err;
}
-static ssize_t kcm_sock_splice(struct sock *sk,
- struct pipe_inode_info *pipe,
- struct splice_pipe_desc *spd)
-{
- int ret;
-
- release_sock(sk);
- ret = splice_to_pipe(pipe, spd);
- lock_sock(sk);
-
- return ret;
-}
-
static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
@@ -1482,7 +1173,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
long timeo;
- struct kcm_rx_msg *rxm;
+ struct strp_rx_msg *rxm;
int err = 0;
ssize_t copied;
struct sk_buff *skb;
@@ -1499,13 +1190,12 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
/* Okay, have a message on the receive queue */
- rxm = kcm_rx_msg(skb);
+ rxm = strp_rx_msg(skb);
if (len > rxm->full_len)
len = rxm->full_len;
- copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags,
- kcm_sock_splice);
+ copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags);
if (copied < 0) {
err = copied;
goto err_out;
@@ -1675,15 +1365,6 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
spin_unlock_bh(&mux->rx_lock);
}
-static void kcm_rx_msg_timeout(unsigned long arg)
-{
- struct kcm_psock *psock = (struct kcm_psock *)arg;
-
- /* Message assembly timed out */
- KCM_STATS_INCR(psock->stats.rx_msg_timeouts);
- kcm_abort_rx_psock(psock, ETIMEDOUT, NULL);
-}
-
static int kcm_attach(struct socket *sock, struct socket *csock,
struct bpf_prog *prog)
{
@@ -1693,19 +1374,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
struct kcm_psock *psock = NULL, *tpsock;
struct list_head *head;
int index = 0;
-
- if (csock->ops->family != PF_INET &&
- csock->ops->family != PF_INET6)
- return -EINVAL;
+ struct strp_callbacks cb;
+ int err;
csk = csock->sk;
if (!csk)
return -EINVAL;
- /* Only support TCP for now */
- if (csk->sk_protocol != IPPROTO_TCP)
- return -EINVAL;
-
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
if (!psock)
return -ENOMEM;
@@ -1714,11 +1389,16 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
psock->sk = csk;
psock->bpf_prog = prog;
- setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout,
- (unsigned long)psock);
+ cb.rcv_msg = kcm_rcv_strparser;
+ cb.abort_parser = NULL;
+ cb.parse_msg = kcm_parse_func_strparser;
+ cb.read_sock_done = kcm_read_sock_done;
- INIT_WORK(&psock->rx_work, psock_rx_work);
- INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+ err = strp_init(&psock->strp, csk, &cb);
+ if (err) {
+ kmem_cache_free(kcm_psockp, psock);
+ return err;
+ }
sock_hold(csk);
@@ -1727,9 +1407,9 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
psock->save_write_space = csk->sk_write_space;
psock->save_state_change = csk->sk_state_change;
csk->sk_user_data = psock;
- csk->sk_data_ready = psock_tcp_data_ready;
- csk->sk_write_space = psock_tcp_write_space;
- csk->sk_state_change = psock_tcp_state_change;
+ csk->sk_data_ready = psock_data_ready;
+ csk->sk_write_space = psock_write_space;
+ csk->sk_state_change = psock_state_change;
write_unlock_bh(&csk->sk_callback_lock);
/* Finished initialization, now add the psock to the MUX. */
@@ -1751,7 +1431,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
spin_unlock_bh(&mux->lock);
/* Schedule RX work in case there are already bytes queued */
- queue_work(kcm_wq, &psock->rx_work);
+ strp_check_rcv(&psock->strp);
return 0;
}
@@ -1791,6 +1471,8 @@ static void kcm_unattach(struct kcm_psock *psock)
struct sock *csk = psock->sk;
struct kcm_mux *mux = psock->mux;
+ lock_sock(csk);
+
/* Stop getting callbacks from TCP socket. After this there should
* be no way to reserve a kcm for this psock.
*/
@@ -1799,7 +1481,7 @@ static void kcm_unattach(struct kcm_psock *psock)
csk->sk_data_ready = psock->save_data_ready;
csk->sk_write_space = psock->save_write_space;
csk->sk_state_change = psock->save_state_change;
- psock->rx_stopped = 1;
+ strp_stop(&psock->strp);
if (WARN_ON(psock->rx_kcm)) {
write_unlock_bh(&csk->sk_callback_lock);
@@ -1822,18 +1504,17 @@ static void kcm_unattach(struct kcm_psock *psock)
write_unlock_bh(&csk->sk_callback_lock);
- del_timer_sync(&psock->rx_msg_timer);
- cancel_work_sync(&psock->rx_work);
- cancel_delayed_work_sync(&psock->rx_delayed_work);
+ /* Call strp_done without sock lock */
+ release_sock(csk);
+ strp_done(&psock->strp);
+ lock_sock(csk);
bpf_prog_put(psock->bpf_prog);
- kfree_skb(psock->rx_skb_head);
- psock->rx_skb_head = NULL;
-
spin_lock_bh(&mux->lock);
aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
+ save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);
KCM_STATS_INCR(mux->stats.psock_unattach);
@@ -1876,6 +1557,8 @@ no_reserved:
fput(csk->sk_socket->file);
kmem_cache_free(kcm_psockp, psock);
}
+
+ release_sock(csk);
}
static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
@@ -1916,6 +1599,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
spin_unlock_bh(&mux->lock);
+ /* Lower socket lock should already be held */
kcm_unattach(psock);
err = 0;
@@ -2073,6 +1757,8 @@ static void release_mux(struct kcm_mux *mux)
aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
aggregate_psock_stats(&mux->aggregate_psock_stats,
&knet->aggregate_psock_stats);
+ aggregate_strp_stats(&mux->aggregate_strp_stats,
+ &knet->aggregate_strp_stats);
list_del_rcu(&mux->kcm_mux_list);
knet->count--;
mutex_unlock(&knet->mutex);
@@ -2152,6 +1838,13 @@ static int kcm_release(struct socket *sock)
* it will just return.
*/
__skb_queue_purge(&sk->sk_write_queue);
+
+ /* Set tx_stopped. This is checked when psock is bound to a kcm and we
+ * get a writespace callback. This prevents further work being queued
+ * from the callback (unbinding the psock occurs after canceling work.
+ */
+ kcm->tx_stopped = 1;
+
release_sock(sk);
spin_lock_bh(&mux->lock);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f9c9ecb0cdd3..c6252ed42c1d 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -36,7 +36,7 @@
#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
-static int pfkey_net_id __read_mostly;
+static unsigned int pfkey_net_id __read_mostly;
struct netns_pfkey {
/* List of all pfkey sockets. */
struct hlist_head table;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index a2ed3bda4ddc..85948c69b236 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -715,7 +715,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
l2tp_info(session, L2TP_MSG_SEQ,
"%s: requested to enable seq numbers by LNS\n",
session->name);
- session->send_seq = -1;
+ session->send_seq = 1;
l2tp_session_set_header_len(session, tunnel->version);
}
} else {
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 5871537af387..aebf281d09ee 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -23,16 +23,6 @@
#define L2TP_HASH_BITS_2 8
#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2)
-/* Debug message categories for the DEBUG socket option */
-enum {
- L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if
- * compiled in) */
- L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel
- * interface */
- L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */
- L2TP_MSG_DATA = (1 << 3), /* data packets */
-};
-
struct sk_buff;
struct l2tp_stats {
@@ -139,7 +129,7 @@ struct l2tp_session {
void (*session_close)(struct l2tp_session *session);
void (*ref)(struct l2tp_session *session);
void (*deref)(struct l2tp_session *session);
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
void (*show)(struct seq_file *m, void *priv);
#endif
uint8_t priv[0]; /* private data */
@@ -273,6 +263,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
const struct l2tp_nl_cmd_ops *ops);
void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
/* Session reference counts. Incremented when code obtains a reference
* to a session.
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 57fc5a46ce06..e2c6ae024565 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -121,7 +121,7 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
}
-static struct net_device_ops l2tp_eth_netdev_ops = {
+static const struct net_device_ops l2tp_eth_netdev_ops = {
.ndo_init = l2tp_eth_dev_init,
.ndo_uninit = l2tp_eth_dev_uninit,
.ndo_start_xmit = l2tp_eth_dev_xmit,
@@ -195,7 +195,7 @@ static void l2tp_eth_delete(struct l2tp_session *session)
}
}
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void l2tp_eth_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -259,6 +259,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
session->mtu = dev->mtu - session->hdr_len;
dev->mtu = session->mtu;
dev->needed_headroom += session->hdr_len;
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
priv = netdev_priv(dev);
priv->dev = dev;
@@ -268,7 +270,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
priv->tunnel_sock = tunnel->sock;
session->recv_skb = l2tp_eth_dev_recv;
session->session_close = l2tp_eth_delete;
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
session->show = l2tp_eth_show;
#endif
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 42de4ccd159f..28c21546d5b6 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/ioctls.h>
#include <linux/icmp.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -47,7 +48,8 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
return (struct l2tp_ip_sock *)sk;
}
-static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
+ __be32 raddr, int dif, u32 tunnel_id)
{
struct sock *sk;
@@ -61,7 +63,9 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif
if ((l2tp->conn_id == tunnel_id) &&
net_eq(sock_net(sk), net) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) &&
+ (!sk->sk_bound_dev_if || !dif ||
+ sk->sk_bound_dev_if == dif))
goto found;
}
@@ -70,15 +74,6 @@ found:
return sk;
}
-static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
-{
- struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id);
- if (sk)
- sock_hold(sk);
-
- return sk;
-}
-
/* When processing receive frames, there are two cases to
* consider. Data frames consist of a non-zero session-id and an
* optional cookie. Control frames consist of a regular L2TP header
@@ -182,15 +177,17 @@ pass_up:
struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
read_lock_bh(&l2tp_ip_lock);
- sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id);
+ sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr,
+ inet_iif(skb), tunnel_id);
+ if (!sk) {
+ read_unlock_bh(&l2tp_ip_lock);
+ goto discard;
+ }
+
+ sock_hold(sk);
read_unlock_bh(&l2tp_ip_lock);
}
- if (sk == NULL)
- goto discard;
-
- sock_hold(sk);
-
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
@@ -251,22 +248,17 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int ret;
int chk_addr_ret;
- if (!sock_flag(sk, SOCK_ZAPPED))
- return -EINVAL;
if (addr_len < sizeof(struct sockaddr_l2tpip))
return -EINVAL;
if (addr->l2tp_family != AF_INET)
return -EINVAL;
- ret = -EADDRINUSE;
- read_lock_bh(&l2tp_ip_lock);
- if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
- sk->sk_bound_dev_if, addr->l2tp_conn_id))
- goto out_in_use;
+ lock_sock(sk);
- read_unlock_bh(&l2tp_ip_lock);
+ ret = -EINVAL;
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ goto out;
- lock_sock(sk);
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
goto out;
@@ -280,14 +272,22 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
- sk_dst_reset(sk);
+ write_lock_bh(&l2tp_ip_lock);
+ if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0,
+ sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
+ write_unlock_bh(&l2tp_ip_lock);
+ ret = -EADDRINUSE;
+ goto out;
+ }
+
+ sk_dst_reset(sk);
l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
- write_lock_bh(&l2tp_ip_lock);
sk_add_bind_node(sk, &l2tp_ip_bind_table);
sk_del_node_init(sk);
write_unlock_bh(&l2tp_ip_lock);
+
ret = 0;
sock_reset_flag(sk, SOCK_ZAPPED);
@@ -295,11 +295,6 @@ out:
release_sock(sk);
return ret;
-
-out_in_use:
- read_unlock_bh(&l2tp_ip_lock);
-
- return ret;
}
static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -307,21 +302,24 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
int rc;
- if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
- return -EINVAL;
-
if (addr_len < sizeof(*lsa))
return -EINVAL;
if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
return -EINVAL;
- rc = ip4_datagram_connect(sk, uaddr, addr_len);
- if (rc < 0)
- return rc;
-
lock_sock(sk);
+ /* Must bind first - autobinding does not work */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ rc = -EINVAL;
+ goto out_sk;
+ }
+
+ rc = __ip4_datagram_connect(sk, uaddr, addr_len);
+ if (rc < 0)
+ goto out_sk;
+
l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
write_lock_bh(&l2tp_ip_lock);
@@ -329,7 +327,9 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
sk_add_bind_node(sk, &l2tp_ip_bind_table);
write_unlock_bh(&l2tp_ip_lock);
+out_sk:
release_sock(sk);
+
return rc;
}
@@ -338,7 +338,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags)
if (sock_flag(sk, SOCK_ZAPPED))
return 0;
- return udp_disconnect(sk, flags);
+ return __udp_disconnect(sk, flags);
}
static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
@@ -554,6 +554,30 @@ out:
return err ? err : copied;
}
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct sk_buff *skb;
+ int amount;
+
+ switch (cmd) {
+ case SIOCOUTQ:
+ amount = sk_wmem_alloc_get(sk);
+ break;
+ case SIOCINQ:
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ amount = skb ? skb->len : 0;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ break;
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(amount, (int __user *)arg);
+}
+EXPORT_SYMBOL(l2tp_ioctl);
+
static struct proto l2tp_ip_prot = {
.name = "L2TP/IP",
.owner = THIS_MODULE,
@@ -562,7 +586,7 @@ static struct proto l2tp_ip_prot = {
.bind = l2tp_ip_bind,
.connect = l2tp_ip_connect,
.disconnect = l2tp_ip_disconnect,
- .ioctl = udp_ioctl,
+ .ioctl = l2tp_ioctl,
.destroy = l2tp_ip_destroy_sock,
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index ea2ae6664cc8..f47c45250f86 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -59,12 +59,14 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
struct in6_addr *laddr,
+ const struct in6_addr *raddr,
int dif, u32 tunnel_id)
{
struct sock *sk;
sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
- const struct in6_addr *addr = inet6_rcv_saddr(sk);
+ const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
+ const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
if (l2tp == NULL)
@@ -72,8 +74,10 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
if ((l2tp->conn_id == tunnel_id) &&
net_eq(sock_net(sk), net) &&
- !(addr && ipv6_addr_equal(addr, laddr)) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) &&
+ (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) &&
+ (!sk->sk_bound_dev_if || !dif ||
+ sk->sk_bound_dev_if == dif))
goto found;
}
@@ -82,17 +86,6 @@ found:
return sk;
}
-static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
- struct in6_addr *laddr,
- int dif, u32 tunnel_id)
-{
- struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id);
- if (sk)
- sock_hold(sk);
-
- return sk;
-}
-
/* When processing receive frames, there are two cases to
* consider. Data frames consist of a non-zero session-id and an
* optional cookie. Control frames consist of a regular L2TP header
@@ -196,16 +189,17 @@ pass_up:
struct ipv6hdr *iph = ipv6_hdr(skb);
read_lock_bh(&l2tp_ip6_lock);
- sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
- 0, tunnel_id);
+ sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr,
+ inet6_iif(skb), tunnel_id);
+ if (!sk) {
+ read_unlock_bh(&l2tp_ip6_lock);
+ goto discard;
+ }
+
+ sock_hold(sk);
read_unlock_bh(&l2tp_ip6_lock);
}
- if (sk == NULL)
- goto discard;
-
- sock_hold(sk);
-
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
@@ -266,11 +260,10 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
struct net *net = sock_net(sk);
__be32 v4addr = 0;
+ int bound_dev_if;
int addr_type;
int err;
- if (!sock_flag(sk, SOCK_ZAPPED))
- return -EINVAL;
if (addr->l2tp_family != AF_INET6)
return -EINVAL;
if (addr_len < sizeof(*addr))
@@ -286,41 +279,34 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_type & IPV6_ADDR_MULTICAST)
return -EADDRNOTAVAIL;
- err = -EADDRINUSE;
- read_lock_bh(&l2tp_ip6_lock);
- if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
- sk->sk_bound_dev_if, addr->l2tp_conn_id))
- goto out_in_use;
- read_unlock_bh(&l2tp_ip6_lock);
-
lock_sock(sk);
err = -EINVAL;
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ goto out_unlock;
+
if (sk->sk_state != TCP_CLOSE)
goto out_unlock;
+ bound_dev_if = sk->sk_bound_dev_if;
+
/* Check if the address belongs to the host. */
rcu_read_lock();
if (addr_type != IPV6_ADDR_ANY) {
struct net_device *dev = NULL;
if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- addr->l2tp_scope_id) {
- /* Override any existing binding, if another
- * one is supplied by user.
- */
- sk->sk_bound_dev_if = addr->l2tp_scope_id;
- }
+ if (addr->l2tp_scope_id)
+ bound_dev_if = addr->l2tp_scope_id;
/* Binding to link-local address requires an
- interface */
- if (!sk->sk_bound_dev_if)
+ * interface.
+ */
+ if (!bound_dev_if)
goto out_unlock_rcu;
err = -ENODEV;
- dev = dev_get_by_index_rcu(sock_net(sk),
- sk->sk_bound_dev_if);
+ dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if);
if (!dev)
goto out_unlock_rcu;
}
@@ -335,13 +321,22 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
rcu_read_unlock();
- inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+ write_lock_bh(&l2tp_ip6_lock);
+ if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if,
+ addr->l2tp_conn_id)) {
+ write_unlock_bh(&l2tp_ip6_lock);
+ err = -EADDRINUSE;
+ goto out_unlock;
+ }
+
+ inet->inet_saddr = v4addr;
+ inet->inet_rcv_saddr = v4addr;
+ sk->sk_bound_dev_if = bound_dev_if;
sk->sk_v6_rcv_saddr = addr->l2tp_addr;
np->saddr = addr->l2tp_addr;
l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
- write_lock_bh(&l2tp_ip6_lock);
sk_add_bind_node(sk, &l2tp_ip6_bind_table);
sk_del_node_init(sk);
write_unlock_bh(&l2tp_ip6_lock);
@@ -354,10 +349,7 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock:
release_sock(sk);
- return err;
-out_in_use:
- read_unlock_bh(&l2tp_ip6_lock);
return err;
}
@@ -370,9 +362,6 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_type;
int rc;
- if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
- return -EINVAL;
-
if (addr_len < sizeof(*lsa))
return -EINVAL;
@@ -389,10 +378,18 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
return -EINVAL;
}
- rc = ip6_datagram_connect(sk, uaddr, addr_len);
-
lock_sock(sk);
+ /* Must bind first - autobinding does not work */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ rc = -EINVAL;
+ goto out_sk;
+ }
+
+ rc = __ip6_datagram_connect(sk, uaddr, addr_len);
+ if (rc < 0)
+ goto out_sk;
+
l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
write_lock_bh(&l2tp_ip6_lock);
@@ -400,6 +397,7 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
sk_add_bind_node(sk, &l2tp_ip6_bind_table);
write_unlock_bh(&l2tp_ip6_lock);
+out_sk:
release_sock(sk);
return rc;
@@ -410,7 +408,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags)
if (sock_flag(sk, SOCK_ZAPPED))
return 0;
- return udp_disconnect(sk, flags);
+ return __udp_disconnect(sk, flags);
}
static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
@@ -519,6 +517,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
@@ -723,7 +722,7 @@ static struct proto l2tp_ip6_prot = {
.bind = l2tp_ip6_bind,
.connect = l2tp_ip6_connect,
.disconnect = l2tp_ip6_disconnect,
- .ioctl = udp_ioctl,
+ .ioctl = l2tp_ioctl,
.destroy = l2tp_ip6_destroy_sock,
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 1d02e8d20e56..3620fba31786 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -31,14 +31,7 @@
#include "l2tp_core.h"
-static struct genl_family l2tp_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = L2TP_GENL_NAME,
- .version = L2TP_GENL_VERSION,
- .hdrsize = 0,
- .maxattr = L2TP_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family l2tp_nl_family;
static const struct genl_multicast_group l2tp_multicast_group[] = {
{
@@ -227,14 +220,14 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]);
if (info->attrs[L2TP_ATTR_UDP_DPORT])
cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]);
- if (info->attrs[L2TP_ATTR_UDP_CSUM])
- cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]);
+ cfg.use_udp_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_CSUM]);
#if IS_ENABLED(CONFIG_IPV6)
- if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX])
- cfg.udp6_zero_tx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
- if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX])
- cfg.udp6_zero_rx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
+ cfg.udp6_zero_tx_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
+ cfg.udp6_zero_rx_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
#endif
}
@@ -386,9 +379,24 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
+ switch (sk->sk_family) {
+ case AF_INET:
+ if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
+ goto nla_put_failure;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ if (udp_get_no_check6_tx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX))
+ goto nla_put_failure;
+ if (udp_get_no_check6_rx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX))
+ goto nla_put_failure;
+ break;
+#endif
+ }
if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
- nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) ||
- nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
+ nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
goto nla_put_failure;
/* NOBREAK */
case L2TP_ENCAPTYPE_IP:
@@ -867,7 +875,7 @@ out:
return skb->len;
}
-static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
+static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
[L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, },
[L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, },
[L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, },
@@ -977,6 +985,19 @@ static const struct genl_ops l2tp_nl_ops[] = {
},
};
+static struct genl_family l2tp_nl_family __ro_after_init = {
+ .name = L2TP_GENL_NAME,
+ .version = L2TP_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = L2TP_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = l2tp_nl_ops,
+ .n_ops = ARRAY_SIZE(l2tp_nl_ops),
+ .mcgrps = l2tp_multicast_group,
+ .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group),
+};
+
int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
{
int ret;
@@ -1010,12 +1031,10 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
}
EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
-static int l2tp_nl_init(void)
+static int __init l2tp_nl_init(void)
{
pr_info("L2TP netlink interface\n");
- return genl_register_family_with_ops_groups(&l2tp_nl_family,
- l2tp_nl_ops,
- l2tp_multicast_group);
+ return genl_register_family(&l2tp_nl_family);
}
static void l2tp_nl_cleanup(void)
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 232cb92033e8..36cc56fd0418 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -177,7 +177,7 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
if (!pskb_may_pull(skb, 2))
return 1;
- if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03))
+ if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI))
skb_pull(skb, 2);
return 0;
@@ -231,14 +231,14 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
- l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ l2tp_dbg(session, L2TP_MSG_DATA,
"%s: recv %d byte data frame, passing to ppp\n",
session->name, data_len);
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
- l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ l2tp_dbg(session, L2TP_MSG_DATA,
"%s: recv %d byte data frame, passing to L2TP socket\n",
session->name, data_len);
@@ -251,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
return;
no_sock:
- l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: no socket\n", session->name);
+ l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
kfree_skb(skb);
}
@@ -282,7 +282,6 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session)
static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
size_t total_len)
{
- static const unsigned char ppph[2] = { 0xff, 0x03 };
struct sock *sk = sock->sk;
struct sk_buff *skb;
int error;
@@ -312,7 +311,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
error = -ENOMEM;
skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) +
uhlen + session->hdr_len +
- sizeof(ppph) + total_len,
+ 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */
0, GFP_KERNEL);
if (!skb)
goto error_put_sess_tun;
@@ -325,8 +324,8 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
skb_reserve(skb, uhlen);
/* Add PPP header */
- skb->data[0] = ppph[0];
- skb->data[1] = ppph[1];
+ skb->data[0] = PPP_ALLSTATIONS;
+ skb->data[1] = PPP_UI;
skb_put(skb, 2);
/* Copy user data into skb */
@@ -369,7 +368,6 @@ error:
*/
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
{
- static const u8 ppph[2] = { 0xff, 0x03 };
struct sock *sk = (struct sock *) chan->private;
struct sock *sk_tun;
struct l2tp_session *session;
@@ -398,14 +396,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
sizeof(struct iphdr) + /* IP header */
uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */
session->hdr_len + /* L2TP header */
- sizeof(ppph); /* PPP header */
+ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */
if (skb_cow_head(skb, headroom))
goto abort_put_sess_tun;
/* Setup PPP header */
- __skb_push(skb, sizeof(ppph));
- skb->data[0] = ppph[0];
- skb->data[1] = ppph[1];
+ __skb_push(skb, 2);
+ skb->data[0] = PPP_ALLSTATIONS;
+ skb->data[1] = PPP_UI;
local_bh_disable();
l2tp_xmit_skb(session, skb, session->hdr_len);
@@ -440,7 +438,7 @@ static void pppol2tp_session_close(struct l2tp_session *session)
BUG_ON(session->magic != L2TP_SESSION_MAGIC);
if (sock) {
- inet_shutdown(sock, 2);
+ inet_shutdown(sock, SEND_SHUTDOWN);
/* Don't let the session go away before our socket does */
l2tp_session_inc_refcount(session);
}
@@ -554,7 +552,7 @@ out:
return error;
}
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void pppol2tp_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -725,7 +723,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
session->recv_skb = pppol2tp_recv;
session->session_close = pppol2tp_session_close;
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
session->show = pppol2tp_show;
#endif
@@ -775,7 +773,7 @@ out_no_ppp:
/* This is how we get the session context from the socket. */
sk->sk_user_data = session;
sk->sk_state = PPPOX_CONNECTED;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
session->name);
end:
@@ -829,7 +827,7 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
ps = l2tp_session_priv(session);
ps->tunnel_sock = tunnel->sock;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
session->name);
error = 0;
@@ -991,7 +989,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
struct l2tp_tunnel *tunnel = session->tunnel;
struct pppol2tp_ioc_stats stats;
- l2tp_dbg(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_dbg(session, L2TP_MSG_CONTROL,
"%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
session->name, cmd, arg);
@@ -1011,7 +1009,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mtu=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
session->name, session->mtu);
err = 0;
break;
@@ -1027,7 +1025,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
session->mtu = ifr.ifr_mtu;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mtu=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n",
session->name, session->mtu);
err = 0;
break;
@@ -1041,7 +1039,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (put_user(session->mru, (int __user *) arg))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mru=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
session->name, session->mru);
err = 0;
break;
@@ -1056,7 +1054,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
break;
session->mru = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mru=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
session->name, session->mru);
err = 0;
break;
@@ -1066,7 +1064,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (put_user(ps->flags, (int __user *) arg))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get flags=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
session->name, ps->flags);
err = 0;
break;
@@ -1076,7 +1074,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (get_user(val, (int __user *) arg))
break;
ps->flags = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set flags=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
session->name, ps->flags);
err = 0;
break;
@@ -1093,7 +1091,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (copy_to_user((void __user *) arg, &stats,
sizeof(stats)))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
session->name);
err = 0;
break;
@@ -1121,7 +1119,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
struct sock *sk;
struct pppol2tp_ioc_stats stats;
- l2tp_dbg(tunnel, PPPOL2TP_MSG_CONTROL,
+ l2tp_dbg(tunnel, L2TP_MSG_CONTROL,
"%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
tunnel->name, cmd, arg);
@@ -1157,7 +1155,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
err = -EFAULT;
break;
}
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
tunnel->name);
err = 0;
break;
@@ -1247,7 +1245,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
tunnel->debug = val;
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
tunnel->name, tunnel->debug);
break;
@@ -1274,8 +1272,8 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->recv_seq = val ? -1 : 0;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ session->recv_seq = !!val;
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set recv_seq=%d\n",
session->name, session->recv_seq);
break;
@@ -1285,7 +1283,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->send_seq = val ? -1 : 0;
+ session->send_seq = !!val;
{
struct sock *ssk = ps->sock;
struct pppox_sock *po = pppox_sk(ssk);
@@ -1293,7 +1291,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
}
l2tp_session_set_header_len(session, session->tunnel->version);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set send_seq=%d\n",
session->name, session->send_seq);
break;
@@ -1303,21 +1301,21 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->lns_mode = val ? -1 : 0;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ session->lns_mode = !!val;
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set lns_mode=%d\n",
session->name, session->lns_mode);
break;
case PPPOL2TP_SO_DEBUG:
session->debug = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
session->name, session->debug);
break;
case PPPOL2TP_SO_REORDERTO:
session->reorder_timeout = msecs_to_jiffies(val);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set reorder_timeout=%d\n",
session->name, session->reorder_timeout);
break;
@@ -1398,7 +1396,7 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
*val = tunnel->debug;
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get debug=%x\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
tunnel->name, tunnel->debug);
break;
@@ -1421,31 +1419,31 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
*val = session->recv_seq;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get recv_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_SENDSEQ:
*val = session->send_seq;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get send_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_LNSMODE:
*val = session->lns_mode;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get lns_mode=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_DEBUG:
*val = session->debug;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get debug=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
session->name, *val);
break;
case PPPOL2TP_SO_REORDERTO:
*val = (int) jiffies_to_msecs(session->reorder_timeout);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get reorder_timeout=%d\n", session->name, *val);
break;
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index c4a1c3e84e12..8da86ceca33d 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
/**
- * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns
- * cached route for L3 master device if relevant
- * to flow
+ * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link
+ * local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
*/
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
- struct flowi6 *fl6)
+struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
+ struct flowi6 *fl6)
{
struct dst_entry *dst = NULL;
struct net_device *dev;
@@ -121,70 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
dev = netdev_master_upper_dev_get_rcu(dev);
if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_rt6_dst)
- dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
+ dev->l3mdev_ops->l3mdev_link_scope_lookup)
+ dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
rcu_read_unlock();
}
return dst;
}
-EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst);
-
-/**
- * l3mdev_get_saddr - get source address for a flow based on an interface
- * enslaved to an L3 master device
- * @net: network namespace for device index lookup
- * @ifindex: Interface index
- * @fl4: IPv4 flow struct
- */
-
-int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
-{
- struct net_device *dev;
- int rc = 0;
-
- if (ifindex) {
- rcu_read_lock();
-
- dev = dev_get_by_index_rcu(net, ifindex);
- if (dev && netif_is_l3_slave(dev))
- dev = netdev_master_upper_dev_get_rcu(dev);
-
- if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_saddr)
- rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
-
- rcu_read_unlock();
- }
-
- return rc;
-}
-EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
-
-int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
-{
- struct net_device *dev;
- int rc = 0;
-
- if (fl6->flowi6_oif) {
- rcu_read_lock();
-
- dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
- if (dev && netif_is_l3_slave(dev))
- dev = netdev_master_upper_dev_get_rcu(dev);
-
- if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_saddr6)
- rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
-
- rcu_read_unlock();
- }
-
- return rc;
-}
-EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
+EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup);
/**
* l3mdev_fib_rule_match - Determine if flowi references an
@@ -222,3 +166,38 @@ out:
return rc;
}
+
+void l3mdev_update_flow(struct net *net, struct flowi *fl)
+{
+ struct net_device *dev;
+ int ifindex;
+
+ rcu_read_lock();
+
+ if (fl->flowi_oif) {
+ dev = dev_get_by_index_rcu(net, fl->flowi_oif);
+ if (dev) {
+ ifindex = l3mdev_master_ifindex_rcu(dev);
+ if (ifindex) {
+ fl->flowi_oif = ifindex;
+ fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
+ goto out;
+ }
+ }
+ }
+
+ if (fl->flowi_iif) {
+ dev = dev_get_by_index_rcu(net, fl->flowi_iif);
+ if (dev) {
+ ifindex = l3mdev_master_ifindex_rcu(dev);
+ if (ifindex) {
+ fl->flowi_iif = ifindex;
+ fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
+ }
+ }
+ }
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(l3mdev_update_flow);
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index fc60d9d738b5..b50b64ac8815 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -33,7 +33,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 182470847fcf..d5d2110eb717 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -31,7 +31,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index 482c94d9d958..eda726e22f64 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -29,7 +29,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 3c1914df641f..75efde3e616c 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -28,7 +28,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 355cc3b6fa4d..1a5535bc3b8d 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -29,7 +29,7 @@
#include <linux/inet.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8ae3ed97d95c..5e9296382420 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
static const struct proto_ops llc_ui_ops;
-static long llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static bool llc_ui_wait_for_conn(struct sock *sk, long timeout);
static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
@@ -532,12 +532,12 @@ out:
static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc = 0;
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
+ if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -547,39 +547,39 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
break;
rc = 0;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
-static long llc_ui_wait_for_conn(struct sock *sk, long timeout)
+static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
+ if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait))
break;
if (signal_pending(current) || !timeout)
break;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return timeout;
}
static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct llc_sock *llc = llc_sk(sk);
int rc;
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
rc = 0;
if (sk_wait_event(sk, &timeout,
(sk->sk_shutdown & RCV_SHUTDOWN) ||
(!llc_data_accept_state(llc->state) &&
!llc->remote_busy_flag &&
- !llc->p_flag)))
+ !llc->p_flag), &wait))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -588,7 +588,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
if (!timeout)
break;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 3e821daf9dd4..8bc5a1bd2d45 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
* another trick required to cope with how the PROCOM state
* machine works. -acme
*/
+ skb_orphan(skb);
+ sock_hold(sk);
skb->sk = sk;
+ skb->destructor = sock_efree;
}
if (!sock_owned_by_user(sk))
llc_conn_rcv(sk, skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d0e1e804ebd7..5404d0d195cc 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
ev->type = LLC_SAP_EV_TYPE_PDU;
ev->reason = 0;
+ skb_orphan(skb);
+ sock_hold(sk);
skb->sk = sk;
+ skb->destructor = sock_efree;
llc_sap_state_process(sap, skb);
}
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index f9137a8341f4..282912245938 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -19,6 +19,7 @@ mac80211-y := \
aes_gcm.o \
aes_cmac.o \
aes_gmac.o \
+ fils_aead.o \
cfg.o \
ethtool.o \
rx.o \
@@ -60,4 +61,4 @@ rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
-ccflags-y += -D__CHECK_ENDIAN__ -DDEBUG
+ccflags-y += -DDEBUG
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index 7663c28ba353..a4e0d59a40dd 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -18,21 +18,24 @@
#include "key.h"
#include "aes_ccm.h"
-void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
- u8 *data, size_t data_len, u8 *mic,
- size_t mic_len)
+int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len)
{
struct scatterlist sg[3];
+ struct aead_request *aead_req;
+ int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ u8 *__aad;
- char aead_req_data[sizeof(struct aead_request) +
- crypto_aead_reqsize(tfm)]
- __aligned(__alignof__(struct aead_request));
- struct aead_request *aead_req = (void *) aead_req_data;
+ aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC);
+ if (!aead_req)
+ return -ENOMEM;
- memset(aead_req, 0, sizeof(aead_req_data));
+ __aad = (u8 *)aead_req + reqsize;
+ memcpy(__aad, aad, CCM_AAD_LEN);
sg_init_table(sg, 3);
- sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad));
sg_set_buf(&sg[1], data, data_len);
sg_set_buf(&sg[2], mic, mic_len);
@@ -41,6 +44,9 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
aead_request_set_ad(aead_req, sg[0].length);
crypto_aead_encrypt(aead_req);
+ kzfree(aead_req);
+
+ return 0;
}
int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
@@ -48,18 +54,23 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
size_t mic_len)
{
struct scatterlist sg[3];
- char aead_req_data[sizeof(struct aead_request) +
- crypto_aead_reqsize(tfm)]
- __aligned(__alignof__(struct aead_request));
- struct aead_request *aead_req = (void *) aead_req_data;
+ struct aead_request *aead_req;
+ int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ u8 *__aad;
+ int err;
if (data_len == 0)
return -EINVAL;
- memset(aead_req, 0, sizeof(aead_req_data));
+ aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC);
+ if (!aead_req)
+ return -ENOMEM;
+
+ __aad = (u8 *)aead_req + reqsize;
+ memcpy(__aad, aad, CCM_AAD_LEN);
sg_init_table(sg, 3);
- sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad));
sg_set_buf(&sg[1], data, data_len);
sg_set_buf(&sg[2], mic, mic_len);
@@ -67,7 +78,10 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0);
aead_request_set_ad(aead_req, sg[0].length);
- return crypto_aead_decrypt(aead_req);
+ err = crypto_aead_decrypt(aead_req);
+ kzfree(aead_req);
+
+ return err;
}
struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[],
diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h
index 6a73d1e4d186..fcd3254c5cf0 100644
--- a/net/mac80211/aes_ccm.h
+++ b/net/mac80211/aes_ccm.h
@@ -12,12 +12,14 @@
#include <linux/crypto.h>
+#define CCM_AAD_LEN 32
+
struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[],
size_t key_len,
size_t mic_len);
-void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
- u8 *data, size_t data_len, u8 *mic,
- size_t mic_len);
+int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len);
int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic,
size_t mic_len);
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index bdf0790d89cc..d0bd5fff5f0a 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -23,7 +23,7 @@
#define AAD_LEN 20
-static void gf_mulx(u8 *pad)
+void gf_mulx(u8 *pad)
{
int i, carry;
@@ -35,9 +35,9 @@ static void gf_mulx(u8 *pad)
pad[AES_BLOCK_SIZE - 1] ^= 0x87;
}
-static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
- const u8 *addr[], const size_t *len, u8 *mac,
- size_t mac_len)
+void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
+ const u8 *addr[], const size_t *len, u8 *mac,
+ size_t mac_len)
{
u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
const u8 *pos, *end;
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index 3702041f44fd..c827e1d5de8b 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -11,6 +11,10 @@
#include <linux/crypto.h>
+void gf_mulx(u8 *pad);
+void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
+ const u8 *addr[], const size_t *len, u8 *mac,
+ size_t mac_len);
struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
size_t key_len);
void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
diff --git a/net/mac80211/aes_gcm.c b/net/mac80211/aes_gcm.c
index 3afe361fd27c..8a4397cc1b08 100644
--- a/net/mac80211/aes_gcm.c
+++ b/net/mac80211/aes_gcm.c
@@ -15,20 +15,23 @@
#include "key.h"
#include "aes_gcm.h"
-void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
- u8 *data, size_t data_len, u8 *mic)
+int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic)
{
struct scatterlist sg[3];
+ struct aead_request *aead_req;
+ int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ u8 *__aad;
- char aead_req_data[sizeof(struct aead_request) +
- crypto_aead_reqsize(tfm)]
- __aligned(__alignof__(struct aead_request));
- struct aead_request *aead_req = (void *)aead_req_data;
+ aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC);
+ if (!aead_req)
+ return -ENOMEM;
- memset(aead_req, 0, sizeof(aead_req_data));
+ __aad = (u8 *)aead_req + reqsize;
+ memcpy(__aad, aad, GCM_AAD_LEN);
sg_init_table(sg, 3);
- sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad));
sg_set_buf(&sg[1], data, data_len);
sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN);
@@ -37,24 +40,31 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
aead_request_set_ad(aead_req, sg[0].length);
crypto_aead_encrypt(aead_req);
+ kzfree(aead_req);
+ return 0;
}
int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic)
{
struct scatterlist sg[3];
- char aead_req_data[sizeof(struct aead_request) +
- crypto_aead_reqsize(tfm)]
- __aligned(__alignof__(struct aead_request));
- struct aead_request *aead_req = (void *)aead_req_data;
+ struct aead_request *aead_req;
+ int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ u8 *__aad;
+ int err;
if (data_len == 0)
return -EINVAL;
- memset(aead_req, 0, sizeof(aead_req_data));
+ aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC);
+ if (!aead_req)
+ return -ENOMEM;
+
+ __aad = (u8 *)aead_req + reqsize;
+ memcpy(__aad, aad, GCM_AAD_LEN);
sg_init_table(sg, 3);
- sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad));
sg_set_buf(&sg[1], data, data_len);
sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN);
@@ -63,7 +73,10 @@ int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
data_len + IEEE80211_GCMP_MIC_LEN, j_0);
aead_request_set_ad(aead_req, sg[0].length);
- return crypto_aead_decrypt(aead_req);
+ err = crypto_aead_decrypt(aead_req);
+ kzfree(aead_req);
+
+ return err;
}
struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[],
diff --git a/net/mac80211/aes_gcm.h b/net/mac80211/aes_gcm.h
index 1347fda6b76a..55aed5352494 100644
--- a/net/mac80211/aes_gcm.h
+++ b/net/mac80211/aes_gcm.h
@@ -11,8 +11,10 @@
#include <linux/crypto.h>
-void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
- u8 *data, size_t data_len, u8 *mic);
+#define GCM_AAD_LEN 32
+
+int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic);
int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
u8 *data, size_t data_len, u8 *mic);
struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[],
diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c
index 3ddd927aaf30..bd72a862ddb7 100644
--- a/net/mac80211/aes_gmac.c
+++ b/net/mac80211/aes_gmac.c
@@ -17,28 +17,27 @@
#include "key.h"
#include "aes_gmac.h"
-#define GMAC_MIC_LEN 16
-#define GMAC_NONCE_LEN 12
-#define AAD_LEN 20
-
int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
const u8 *data, size_t data_len, u8 *mic)
{
struct scatterlist sg[4];
- char aead_req_data[sizeof(struct aead_request) +
- crypto_aead_reqsize(tfm)]
- __aligned(__alignof__(struct aead_request));
- struct aead_request *aead_req = (void *)aead_req_data;
- u8 zero[GMAC_MIC_LEN], iv[AES_BLOCK_SIZE];
+ u8 *zero, *__aad, iv[AES_BLOCK_SIZE];
+ struct aead_request *aead_req;
+ int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
if (data_len < GMAC_MIC_LEN)
return -EINVAL;
- memset(aead_req, 0, sizeof(aead_req_data));
+ aead_req = kzalloc(reqsize + GMAC_MIC_LEN + GMAC_AAD_LEN, GFP_ATOMIC);
+ if (!aead_req)
+ return -ENOMEM;
+
+ zero = (u8 *)aead_req + reqsize;
+ __aad = zero + GMAC_MIC_LEN;
+ memcpy(__aad, aad, GMAC_AAD_LEN);
- memset(zero, 0, GMAC_MIC_LEN);
sg_init_table(sg, 4);
- sg_set_buf(&sg[0], aad, AAD_LEN);
+ sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
sg_set_buf(&sg[3], mic, GMAC_MIC_LEN);
@@ -49,9 +48,10 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
aead_request_set_tfm(aead_req, tfm);
aead_request_set_crypt(aead_req, sg, sg, 0, iv);
- aead_request_set_ad(aead_req, AAD_LEN + data_len);
+ aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len);
crypto_aead_encrypt(aead_req);
+ kzfree(aead_req);
return 0;
}
diff --git a/net/mac80211/aes_gmac.h b/net/mac80211/aes_gmac.h
index d328204d73a8..32e6442c95be 100644
--- a/net/mac80211/aes_gmac.h
+++ b/net/mac80211/aes_gmac.h
@@ -11,6 +11,10 @@
#include <linux/crypto.h>
+#define GMAC_AAD_LEN 20
+#define GMAC_MIC_LEN 16
+#define GMAC_NONCE_LEN 12
+
struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
size_t key_len);
int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index afa94687d5e1..3b5fd4188f2a 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -304,19 +304,18 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
buf_size = IEEE80211_MAX_AMPDU_BUF;
/* make sure the size doesn't exceed the maximum supported by the hw */
- if (buf_size > local->hw.max_rx_aggregation_subframes)
- buf_size = local->hw.max_rx_aggregation_subframes;
+ if (buf_size > sta->sta.max_rx_aggregation_subframes)
+ buf_size = sta->sta.max_rx_aggregation_subframes;
params.buf_size = buf_size;
+ ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n",
+ buf_size, sta->sta.addr);
+
/* examine state machine */
mutex_lock(&sta->ampdu_mlme.mtx);
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
- tid_agg_rx = rcu_dereference_protected(
- sta->ampdu_mlme.tid_rx[tid],
- lockdep_is_held(&sta->ampdu_mlme.mtx));
-
- if (tid_agg_rx->dialog_token == dialog_token) {
+ if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
ht_dbg_ratelimited(sta->sdata,
"updated AddBA Req from %pM on tid %u\n",
sta->sta.addr, tid);
@@ -393,7 +392,6 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* update data */
- tid_agg_rx->dialog_token = dialog_token;
tid_agg_rx->ssn = start_seq_num;
tid_agg_rx->head_seq_num = start_seq_num;
tid_agg_rx->buf_size = buf_size;
@@ -412,8 +410,11 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
end:
- if (status == WLAN_STATUS_SUCCESS)
+ if (status == WLAN_STATUS_SUCCESS) {
__set_bit(tid, sta->ampdu_mlme.agg_session_valid);
+ __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
+ sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
+ }
mutex_unlock(&sta->ampdu_mlme.mtx);
end_no_lock:
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 543b1d4fc33d..e91e503bf992 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3,6 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
+ * Copyright (C) 2015-2016 Intel Deutschland GmbH
*
* This file is GPLv2 as found in COPYING.
*/
@@ -39,7 +40,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
if (type == NL80211_IFTYPE_MONITOR && flags) {
sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
}
return wdev;
@@ -73,8 +74,29 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
sdata->u.mgd.use_4addr = params->use_4addr;
}
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *monitor_sdata;
+ u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+ monitor_sdata = rtnl_dereference(local->monitor_sdata);
+ if (monitor_sdata &&
+ wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) {
+ memcpy(monitor_sdata->vif.bss_conf.mu_group.membership,
+ params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN);
+ memcpy(monitor_sdata->vif.bss_conf.mu_group.position,
+ params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
+ WLAN_USER_POSITION_LEN);
+ monitor_sdata->vif.mu_mimo_owner = true;
+ ieee80211_bss_info_change_notify(monitor_sdata,
+ BSS_CHANGED_MU_GROUPS);
+
+ ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr,
+ params->macaddr);
+ }
+
+ if (!flags)
+ return 0;
if (ieee80211_sdata_running(sdata)) {
u32 mask = MONITOR_FLAG_COOK_FRAMES |
@@ -89,11 +111,11 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
* cooked_mntrs, monitor and all fif_* counters
* reconfigure hardware
*/
- if ((*flags & mask) != (sdata->u.mntr_flags & mask))
+ if ((*flags & mask) != (sdata->u.mntr.flags & mask))
return -EBUSY;
ieee80211_adjust_monitor_flags(sdata, -1);
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
ieee80211_adjust_monitor_flags(sdata, 1);
ieee80211_configure_filter(local);
@@ -103,7 +125,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
* and ieee80211_do_open take care of "everything"
* mentioned in the comment above.
*/
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
}
}
@@ -131,6 +153,149 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
}
+static int ieee80211_start_nan(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ int ret;
+
+ mutex_lock(&sdata->local->chanctx_mtx);
+ ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
+ mutex_unlock(&sdata->local->chanctx_mtx);
+ if (ret < 0)
+ return ret;
+
+ ret = ieee80211_do_open(wdev, true);
+ if (ret)
+ return ret;
+
+ ret = drv_start_nan(sdata->local, sdata, conf);
+ if (ret)
+ ieee80211_sdata_stop(sdata);
+
+ sdata->u.nan.conf = *conf;
+
+ return ret;
+}
+
+static void ieee80211_stop_nan(struct wiphy *wiphy,
+ struct wireless_dev *wdev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ drv_stop_nan(sdata->local, sdata);
+ ieee80211_sdata_stop(sdata);
+}
+
+static int ieee80211_nan_change_conf(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf,
+ u32 changes)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct cfg80211_nan_conf new_conf;
+ int ret = 0;
+
+ if (sdata->vif.type != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (!ieee80211_sdata_running(sdata))
+ return -ENETDOWN;
+
+ new_conf = sdata->u.nan.conf;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
+ new_conf.master_pref = conf->master_pref;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_DUAL)
+ new_conf.dual = conf->dual;
+
+ ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
+ if (!ret)
+ sdata->u.nan.conf = new_conf;
+
+ return ret;
+}
+
+static int ieee80211_add_nan_func(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_func *nan_func)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ int ret;
+
+ if (sdata->vif.type != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (!ieee80211_sdata_running(sdata))
+ return -ENETDOWN;
+
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ ret = idr_alloc(&sdata->u.nan.function_inst_ids,
+ nan_func, 1, sdata->local->hw.max_nan_de_entries + 1,
+ GFP_ATOMIC);
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+
+ if (ret < 0)
+ return ret;
+
+ nan_func->instance_id = ret;
+
+ WARN_ON(nan_func->instance_id == 0);
+
+ ret = drv_add_nan_func(sdata->local, sdata, nan_func);
+ if (ret) {
+ spin_lock_bh(&sdata->u.nan.func_lock);
+ idr_remove(&sdata->u.nan.function_inst_ids,
+ nan_func->instance_id);
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+ }
+
+ return ret;
+}
+
+static struct cfg80211_nan_func *
+ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata,
+ u64 cookie)
+{
+ struct cfg80211_nan_func *func;
+ int id;
+
+ lockdep_assert_held(&sdata->u.nan.func_lock);
+
+ idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) {
+ if (func->cookie == cookie)
+ return func;
+ }
+
+ return NULL;
+}
+
+static void ieee80211_del_nan_func(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct cfg80211_nan_func *func;
+ u8 instance_id = 0;
+
+ if (sdata->vif.type != NL80211_IFTYPE_NAN ||
+ !ieee80211_sdata_running(sdata))
+ return;
+
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ func = ieee80211_find_nan_func_by_cookie(sdata, cookie);
+ if (func)
+ instance_id = func->instance_id;
+
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+
+ if (instance_id)
+ drv_del_nan_func(sdata->local, sdata, instance_id);
+}
+
static int ieee80211_set_noack_map(struct wiphy *wiphy,
struct net_device *dev,
u16 noack_map)
@@ -192,10 +357,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
mutex_lock(&local->sta_mtx);
if (mac_addr) {
- if (ieee80211_vif_is_mesh(&sdata->vif))
- sta = sta_info_get(sdata, mac_addr);
- else
- sta = sta_info_get_bss(sdata, mac_addr);
+ sta = sta_info_get_bss(sdata, mac_addr);
/*
* The ASSOC test makes sure the driver is ready to
* receive the key. When wpa_supplicant has roamed
@@ -236,6 +398,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
case NL80211_IFTYPE_UNSPECIFIED:
case NUM_NL80211_IFTYPES:
case NL80211_IFTYPE_P2P_CLIENT:
@@ -701,6 +864,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
}
sdata->needed_rx_chains = sdata->local->rx_chains;
+ sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+
mutex_lock(&local->mtx);
err = ieee80211_vif_use_channel(sdata, &params->chandef,
IEEE80211_CHANCTX_SHARED);
@@ -731,7 +896,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
vlan->vif.type);
}
- sdata->vif.bss_conf.beacon_int = params->beacon_interval;
sdata->vif.bss_conf.dtim_period = params->dtim_period;
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
@@ -1357,9 +1521,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
goto out_err;
if (params->vlan && params->vlan != sta->sdata->dev) {
- bool prev_4addr = false;
- bool new_4addr = false;
-
vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
if (params->vlan->ieee80211_ptr->use_4addr) {
@@ -1369,26 +1530,21 @@ static int ieee80211_change_station(struct wiphy *wiphy,
}
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
- new_4addr = true;
__ieee80211_check_fast_rx_iface(vlansdata);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- sta->sdata->u.vlan.sta) {
+ sta->sdata->u.vlan.sta)
RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL);
- prev_4addr = true;
- }
+
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ ieee80211_vif_dec_num_mcast(sta->sdata);
sta->sdata = vlansdata;
ieee80211_check_fast_xmit(sta);
- if (sta->sta_state == IEEE80211_STA_AUTHORIZED &&
- prev_4addr != new_4addr) {
- if (new_4addr)
- atomic_dec(&sta->sdata->bss->num_mcast_sta);
- else
- atomic_inc(&sta->sdata->bss->num_mcast_sta);
- }
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ ieee80211_vif_inc_num_mcast(sta->sdata);
ieee80211_send_layer2_update(sta);
}
@@ -2015,6 +2171,7 @@ static int ieee80211_scan(struct wiphy *wiphy,
!(req->flags & NL80211_SCAN_FLAG_AP)))
return -EOPNOTSUPP;
break;
+ case NL80211_IFTYPE_NAN:
default:
return -EOPNOTSUPP;
}
@@ -2313,13 +2470,6 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
smps_mode == IEEE80211_SMPS_AUTOMATIC)
return 0;
- /* If no associated stations, there's no need to do anything */
- if (!atomic_read(&sdata->u.ap.num_mcast_sta)) {
- sdata->smps_mode = smps_mode;
- ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
- return 0;
- }
-
ht_dbg(sdata,
"SMPS %d requested in AP mode, sending Action frame to %d stations\n",
smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
@@ -2940,10 +3090,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
}
chanctx = container_of(conf, struct ieee80211_chanctx, conf);
- if (!chanctx) {
- err = -EBUSY;
- goto out;
- }
ch_switch.timestamp = 0;
ch_switch.device_timestamp = 0;
@@ -3360,6 +3506,63 @@ static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev,
return -ENOENT;
}
+void ieee80211_nan_func_terminated(struct ieee80211_vif *vif,
+ u8 inst_id,
+ enum nl80211_nan_func_term_reason reason,
+ gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct cfg80211_nan_func *func;
+ u64 cookie;
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_NAN))
+ return;
+
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ func = idr_find(&sdata->u.nan.function_inst_ids, inst_id);
+ if (WARN_ON(!func)) {
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+ return;
+ }
+
+ cookie = func->cookie;
+ idr_remove(&sdata->u.nan.function_inst_ids, inst_id);
+
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+
+ cfg80211_free_nan_func(func);
+
+ cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id,
+ reason, cookie, gfp);
+}
+EXPORT_SYMBOL(ieee80211_nan_func_terminated);
+
+void ieee80211_nan_func_match(struct ieee80211_vif *vif,
+ struct cfg80211_nan_match_params *match,
+ gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct cfg80211_nan_func *func;
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_NAN))
+ return;
+
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id);
+ if (WARN_ON(!func)) {
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+ return;
+ }
+ match->cookie = func->cookie;
+
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+
+ cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp);
+}
+EXPORT_SYMBOL(ieee80211_nan_func_match);
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3445,4 +3648,9 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_ap_chanwidth = ieee80211_set_ap_chanwidth,
.add_tx_ts = ieee80211_add_tx_ts,
.del_tx_ts = ieee80211_del_tx_ts,
+ .start_nan = ieee80211_start_nan,
+ .stop_nan = ieee80211_stop_nan,
+ .nan_change_conf = ieee80211_nan_change_conf,
+ .add_nan_func = ieee80211_add_nan_func,
+ .del_nan_func = ieee80211_del_nan_func,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 74142d07ad31..a0d901d8992e 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -231,9 +231,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
!(sta->sdata->bss && sta->sdata->bss == sdata->bss))
continue;
- if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC))
- continue;
-
max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
}
rcu_read_unlock();
@@ -274,6 +271,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
ieee80211_get_max_required_bw(sdata));
break;
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
continue;
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_WDS:
@@ -646,6 +644,9 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
struct ieee80211_chanctx *curr_ctx = NULL;
int ret = 0;
+ if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
+ return -ENOTSUPP;
+
conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
lockdep_is_held(&local->chanctx_mtx));
@@ -718,6 +719,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
switch (sdata->vif.type) {
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
continue;
case NL80211_IFTYPE_STATION:
if (!sdata->u.mgd.associated)
@@ -980,6 +982,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
case NUM_NL80211_IFTYPES:
WARN_ON(1);
break;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2906c1004e1a..e02ba42ca827 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -71,138 +71,45 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
-struct aqm_info {
- struct ieee80211_local *local;
- size_t size;
- size_t len;
- unsigned char buf[0];
-};
-
-#define AQM_HDR_LEN 200
-#define AQM_HW_ENTRY_LEN 40
-#define AQM_TXQ_ENTRY_LEN 110
-
-static int aqm_open(struct inode *inode, struct file *file)
+static ssize_t aqm_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
{
- struct ieee80211_local *local = inode->i_private;
- struct ieee80211_sub_if_data *sdata;
- struct sta_info *sta;
- struct txq_info *txqi;
+ struct ieee80211_local *local = file->private_data;
struct fq *fq = &local->fq;
- struct aqm_info *info = NULL;
+ char buf[200];
int len = 0;
- int i;
-
- if (!local->ops->wake_tx_queue)
- return -EOPNOTSUPP;
-
- len += AQM_HDR_LEN;
- len += 6 * AQM_HW_ENTRY_LEN;
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list)
- len += AQM_TXQ_ENTRY_LEN;
- list_for_each_entry_rcu(sta, &local->sta_list, list)
- len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq);
- rcu_read_unlock();
-
- info = vmalloc(len);
- if (!info)
- return -ENOMEM;
spin_lock_bh(&local->fq.lock);
rcu_read_lock();
- file->private_data = info;
- info->local = local;
- info->size = len;
- len = 0;
-
- len += scnprintf(info->buf + len, info->size - len,
- "* hw\n"
- "access name value\n"
- "R fq_flows_cnt %u\n"
- "R fq_backlog %u\n"
- "R fq_overlimit %u\n"
- "R fq_collisions %u\n"
- "RW fq_limit %u\n"
- "RW fq_quantum %u\n",
- fq->flows_cnt,
- fq->backlog,
- fq->overlimit,
- fq->collisions,
- fq->limit,
- fq->quantum);
-
- len += scnprintf(info->buf + len,
- info->size - len,
- "* vif\n"
- "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
-
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- txqi = to_txq_info(sdata->vif.txq);
- len += scnprintf(info->buf + len, info->size - len,
- "%s %pM %u %u %u %u %u %u %u %u\n",
- sdata->name,
- sdata->vif.addr,
- txqi->txq.ac,
- txqi->tin.backlog_bytes,
- txqi->tin.backlog_packets,
- txqi->tin.flows,
- txqi->tin.overlimit,
- txqi->tin.collisions,
- txqi->tin.tx_bytes,
- txqi->tin.tx_packets);
- }
-
- len += scnprintf(info->buf + len,
- info->size - len,
- "* sta\n"
- "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
-
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
- sdata = sta->sdata;
- for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- txqi = to_txq_info(sta->sta.txq[i]);
- len += scnprintf(info->buf + len, info->size - len,
- "%s %pM %d %d %u %u %u %u %u %u %u\n",
- sdata->name,
- sta->sta.addr,
- txqi->txq.tid,
- txqi->txq.ac,
- txqi->tin.backlog_bytes,
- txqi->tin.backlog_packets,
- txqi->tin.flows,
- txqi->tin.overlimit,
- txqi->tin.collisions,
- txqi->tin.tx_bytes,
- txqi->tin.tx_packets);
- }
- }
-
- info->len = len;
+ len = scnprintf(buf, sizeof(buf),
+ "access name value\n"
+ "R fq_flows_cnt %u\n"
+ "R fq_backlog %u\n"
+ "R fq_overlimit %u\n"
+ "R fq_overmemory %u\n"
+ "R fq_collisions %u\n"
+ "R fq_memory_usage %u\n"
+ "RW fq_memory_limit %u\n"
+ "RW fq_limit %u\n"
+ "RW fq_quantum %u\n",
+ fq->flows_cnt,
+ fq->backlog,
+ fq->overmemory,
+ fq->overlimit,
+ fq->collisions,
+ fq->memory_usage,
+ fq->memory_limit,
+ fq->limit,
+ fq->quantum);
rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
- return 0;
-}
-
-static int aqm_release(struct inode *inode, struct file *file)
-{
- vfree(file->private_data);
- return 0;
-}
-
-static ssize_t aqm_read(struct file *file,
- char __user *user_buf,
- size_t count,
- loff_t *ppos)
-{
- struct aqm_info *info = file->private_data;
-
return simple_read_from_buffer(user_buf, count, ppos,
- info->buf, info->len);
+ buf, len);
}
static ssize_t aqm_write(struct file *file,
@@ -210,8 +117,7 @@ static ssize_t aqm_write(struct file *file,
size_t count,
loff_t *ppos)
{
- struct aqm_info *info = file->private_data;
- struct ieee80211_local *local = info->local;
+ struct ieee80211_local *local = file->private_data;
char buf[100];
size_t len;
@@ -228,6 +134,8 @@ static ssize_t aqm_write(struct file *file,
if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
return count;
+ else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1)
+ return count;
else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1)
return count;
@@ -237,8 +145,7 @@ static ssize_t aqm_write(struct file *file,
static const struct file_operations aqm_ops = {
.write = aqm_write,
.read = aqm_read,
- .open = aqm_open,
- .release = aqm_release,
+ .open = simple_open,
.llseek = default_llseek,
};
@@ -302,6 +209,8 @@ static const char *hw_flag_names[] = {
FLAG(USES_RSS),
FLAG(TX_AMSDU),
FLAG(TX_FRAG_LIST),
+ FLAG(REPORTS_LOW_ACK),
+ FLAG(SUPPORTS_TX_FRAG),
#undef FLAG
};
@@ -428,7 +337,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(hwflags);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
- DEBUGFS_ADD_MODE(aqm, 0600);
+
+ if (local->ops->wake_tx_queue)
+ DEBUGFS_ADD_MODE(aqm, 0600);
statsd = debugfs_create_dir("statistics", phyd);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index a5ba739cd2a7..1a05f85cb1f0 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -30,7 +30,7 @@ static ssize_t ieee80211_if_read(
size_t count, loff_t *ppos,
ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
{
- char buf[70];
+ char buf[200];
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
@@ -477,6 +477,7 @@ IEEE80211_IF_FILE_RW(tdls_wider_bw);
IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC);
IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC);
IEEE80211_IF_FILE(dtim_count, u.ap.ps.dtim_count, DEC);
+IEEE80211_IF_FILE(num_mcast_sta_vlan, u.vlan.num_mcast_sta, ATOMIC);
static ssize_t ieee80211_if_fmt_num_buffered_multicast(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -486,6 +487,38 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast(
}
IEEE80211_IF_FILE_R(num_buffered_multicast);
+static ssize_t ieee80211_if_fmt_aqm(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+ int len;
+
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ len = scnprintf(buf,
+ buflen,
+ "ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"
+ "%u %u %u %u %u %u %u %u %u %u\n",
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->cstats.drop_count,
+ txqi->cstats.ecn_mark,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ return len;
+}
+IEEE80211_IF_FILE_R(aqm);
+
/* IBSS attributes */
static ssize_t ieee80211_if_fmt_tsf(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -524,9 +557,15 @@ static ssize_t ieee80211_if_parse_tsf(
ret = kstrtoull(buf, 10, &tsf);
if (ret < 0)
return ret;
- if (tsf_is_delta)
- tsf = drv_get_tsf(local, sdata) + tsf_is_delta * tsf;
- if (local->ops->set_tsf) {
+ if (tsf_is_delta && local->ops->offset_tsf) {
+ drv_offset_tsf(local, sdata, tsf_is_delta * tsf);
+ wiphy_info(local->hw.wiphy,
+ "debugfs offset TSF by %018lld\n",
+ tsf_is_delta * tsf);
+ } else if (local->ops->set_tsf) {
+ if (tsf_is_delta)
+ tsf = drv_get_tsf(local, sdata) +
+ tsf_is_delta * tsf;
drv_set_tsf(local, sdata, tsf);
wiphy_info(local->hw.wiphy,
"debugfs set TSF to %#018llx\n", tsf);
@@ -618,6 +657,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz);
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
+
+ if (sdata->local->ops->wake_tx_queue)
+ DEBUGFS_ADD(aqm);
}
static void add_sta_files(struct ieee80211_sub_if_data *sdata)
@@ -643,6 +685,13 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
}
+static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
+{
+ /* add num_mcast_sta_vlan using name num_mcast_sta */
+ debugfs_create_file("num_mcast_sta", 0400, sdata->vif.debugfs_dir,
+ sdata, &num_mcast_sta_vlan_ops);
+}
+
static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
{
DEBUGFS_ADD_MODE(tsf, 0600);
@@ -746,6 +795,9 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_AP:
add_ap_files(sdata);
break;
+ case NL80211_IFTYPE_AP_VLAN:
+ add_vlan_files(sdata);
+ break;
case NL80211_IFTYPE_WDS:
add_wds_files(sdata);
break;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index fd334133ff45..f6003b8c2c33 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -133,6 +133,55 @@ static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
}
STA_OPS(last_seq_ctrl);
+#define AQM_TXQ_ENTRY_LEN 130
+
+static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->local;
+ size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
+ char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+ struct txq_info *txqi;
+ ssize_t rv;
+ int i;
+
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ p += scnprintf(p,
+ bufsz+buf-p,
+ "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n");
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ txqi = to_txq_info(sta->sta.txq[i]);
+ p += scnprintf(p, bufsz+buf-p,
+ "%d %d %u %u %u %u %u %u %u %u %u\n",
+ txqi->txq.tid,
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->cstats.drop_count,
+ txqi->cstats.ecn_mark,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+ }
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return rv;
+}
+STA_OPS(aqm);
+
static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -150,13 +199,18 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
"TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ bool tid_rx_valid;
+
tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
+ tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid);
p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
+ tid_rx_valid);
p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
- tid_rx ? tid_rx->dialog_token : 0);
+ tid_rx_valid ?
+ sta->ampdu_mlme.tid_rx_token[i] : 0);
p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
tid_rx ? tid_rx->ssn : 0);
@@ -478,6 +532,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered);
+ if (local->ops->wake_tx_queue)
+ DEBUGFS_ADD(aqm);
+
if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
debugfs_create_x32("driver_buffered_tids", 0400,
sta->debugfs_dir,
@@ -492,10 +549,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
void ieee80211_sta_debugfs_remove(struct sta_info *sta)
{
- struct ieee80211_local *local = sta->local;
- struct ieee80211_sub_if_data *sdata = sta->sdata;
-
- drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
debugfs_remove_recursive(sta->debugfs_dir);
sta->debugfs_dir = NULL;
}
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index c258f1041d33..bb886e7db47f 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -62,7 +62,7 @@ int drv_add_interface(struct ieee80211_local *local,
if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
(sdata->vif.type == NL80211_IFTYPE_MONITOR &&
!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
- !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))))
return -EINVAL;
trace_drv_add_interface(local, sdata);
@@ -215,6 +215,21 @@ void drv_set_tsf(struct ieee80211_local *local,
trace_drv_return_void(local);
}
+void drv_offset_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ s64 offset)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_offset_tsf(local, sdata, offset);
+ if (local->ops->offset_tsf)
+ local->ops->offset_tsf(&local->hw, &sdata->vif, offset);
+ trace_drv_return_void(local);
+}
+
void drv_reset_tsf(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata)
{
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index ba5fc1f01e53..09f77e4a8a79 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -162,7 +162,9 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
return;
if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR))
+ sdata->vif.type == NL80211_IFTYPE_NAN ||
+ (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !sdata->vif.mu_mimo_owner)))
return;
if (!check_sdata_in_driver(sdata))
@@ -498,21 +500,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
local->ops->sta_add_debugfs(&local->hw, &sdata->vif,
sta, dir);
}
-
-static inline void drv_sta_remove_debugfs(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- struct ieee80211_sta *sta,
- struct dentry *dir)
-{
- might_sleep();
-
- sdata = get_bss_sdata(sdata);
- check_sdata_in_driver(sdata);
-
- if (local->ops->sta_remove_debugfs)
- local->ops->sta_remove_debugfs(&local->hw, &sdata->vif,
- sta, dir);
-}
#endif
static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
@@ -582,6 +569,9 @@ u64 drv_get_tsf(struct ieee80211_local *local,
void drv_set_tsf(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
u64 tsf);
+void drv_offset_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ s64 offset);
void drv_reset_tsf(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
@@ -1088,13 +1078,13 @@ static inline void drv_leave_ibss(struct ieee80211_local *local,
}
static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
- struct ieee80211_sta *sta)
+ struct sta_info *sta)
{
u32 ret = 0;
- trace_drv_get_expected_throughput(sta);
- if (local->ops->get_expected_throughput)
- ret = local->ops->get_expected_throughput(&local->hw, sta);
+ trace_drv_get_expected_throughput(&sta->sta);
+ if (local->ops->get_expected_throughput && sta->uploaded)
+ ret = local->ops->get_expected_throughput(&local->hw, &sta->sta);
trace_drv_return_u32(local, ret);
return ret;
@@ -1179,4 +1169,83 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
local->ops->wake_tx_queue(&local->hw, &txq->txq);
}
+static inline int drv_start_nan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_nan_conf *conf)
+{
+ int ret;
+
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ trace_drv_start_nan(local, sdata, conf);
+ ret = local->ops->start_nan(&local->hw, &sdata->vif, conf);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_stop_nan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ trace_drv_stop_nan(local, sdata);
+ local->ops->stop_nan(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_nan_change_conf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_nan_conf *conf,
+ u32 changes)
+{
+ int ret;
+
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ if (!local->ops->nan_change_conf)
+ return -EOPNOTSUPP;
+
+ trace_drv_nan_change_conf(local, sdata, conf, changes);
+ ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf,
+ changes);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline int drv_add_nan_func(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_nan_func *nan_func)
+{
+ int ret;
+
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ if (!local->ops->add_nan_func)
+ return -EOPNOTSUPP;
+
+ trace_drv_add_nan_func(local, sdata, nan_func);
+ ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_del_nan_func(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u8 instance_id)
+{
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ trace_drv_del_nan_func(local, sdata, instance_id);
+ if (local->ops->del_nan_func)
+ local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id);
+ trace_drv_return_void(local);
+}
+
#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
new file mode 100644
index 000000000000..5c3af5eb4052
--- /dev/null
+++ b/net/mac80211/fils_aead.c
@@ -0,0 +1,342 @@
+/*
+ * FILS AEAD for (Re)Association Request/Response frames
+ * Copyright 2016, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/skcipher.h>
+
+#include "ieee80211_i.h"
+#include "aes_cmac.h"
+#include "fils_aead.h"
+
+static int aes_s2v(struct crypto_cipher *tfm,
+ size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
+{
+ u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE];
+ size_t i;
+ const u8 *data[2];
+ size_t data_len[2], data_elems;
+
+ /* D = AES-CMAC(K, <zero>) */
+ memset(tmp, 0, AES_BLOCK_SIZE);
+ data[0] = tmp;
+ data_len[0] = AES_BLOCK_SIZE;
+ aes_cmac_vector(tfm, 1, data, data_len, d, AES_BLOCK_SIZE);
+
+ for (i = 0; i < num_elem - 1; i++) {
+ /* D = dbl(D) xor AES_CMAC(K, Si) */
+ gf_mulx(d); /* dbl */
+ aes_cmac_vector(tfm, 1, &addr[i], &len[i], tmp,
+ AES_BLOCK_SIZE);
+ crypto_xor(d, tmp, AES_BLOCK_SIZE);
+ }
+
+ if (len[i] >= AES_BLOCK_SIZE) {
+ /* len(Sn) >= 128 */
+ size_t j;
+ const u8 *pos;
+
+ /* T = Sn xorend D */
+
+ /* Use a temporary buffer to perform xorend on Sn (addr[i]) to
+ * avoid modifying the const input argument.
+ */
+ data[0] = addr[i];
+ data_len[0] = len[i] - AES_BLOCK_SIZE;
+ pos = addr[i] + data_len[0];
+ for (j = 0; j < AES_BLOCK_SIZE; j++)
+ tmp[j] = pos[j] ^ d[j];
+ data[1] = tmp;
+ data_len[1] = AES_BLOCK_SIZE;
+ data_elems = 2;
+ } else {
+ /* len(Sn) < 128 */
+ /* T = dbl(D) xor pad(Sn) */
+ gf_mulx(d); /* dbl */
+ memset(tmp, 0, AES_BLOCK_SIZE);
+ memcpy(tmp, addr[i], len[i]);
+ tmp[len[i]] = 0x80;
+ crypto_xor(d, tmp, AES_BLOCK_SIZE);
+ data[0] = d;
+ data_len[0] = sizeof(d);
+ data_elems = 1;
+ }
+ /* V = AES-CMAC(K, T) */
+ aes_cmac_vector(tfm, data_elems, data, data_len, v, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+/* Note: addr[] and len[] needs to have one extra slot at the end. */
+static int aes_siv_encrypt(const u8 *key, size_t key_len,
+ const u8 *plain, size_t plain_len,
+ size_t num_elem, const u8 *addr[],
+ size_t len[], u8 *out)
+{
+ u8 v[AES_BLOCK_SIZE];
+ struct crypto_cipher *tfm;
+ struct crypto_skcipher *tfm2;
+ struct skcipher_request *req;
+ int res;
+ struct scatterlist src[1], dst[1];
+ u8 *tmp;
+
+ key_len /= 2; /* S2V key || CTR key */
+
+ addr[num_elem] = plain;
+ len[num_elem] = plain_len;
+ num_elem++;
+
+ /* S2V */
+
+ tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ /* K1 for S2V */
+ res = crypto_cipher_setkey(tfm, key, key_len);
+ if (!res)
+ res = aes_s2v(tfm, num_elem, addr, len, v);
+ crypto_free_cipher(tfm);
+ if (res)
+ return res;
+
+ /* Use a temporary buffer of the plaintext to handle need for
+ * overwriting this during AES-CTR.
+ */
+ tmp = kmemdup(plain, plain_len, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ /* IV for CTR before encrypted data */
+ memcpy(out, v, AES_BLOCK_SIZE);
+
+ /* Synthetic IV to be used as the initial counter in CTR:
+ * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
+ */
+ v[8] &= 0x7f;
+ v[12] &= 0x7f;
+
+ /* CTR */
+
+ tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm2)) {
+ kfree(tmp);
+ return PTR_ERR(tfm2);
+ }
+ /* K2 for CTR */
+ res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
+ if (res)
+ goto fail;
+
+ req = skcipher_request_alloc(tfm2, GFP_KERNEL);
+ if (!req) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ sg_init_one(src, tmp, plain_len);
+ sg_init_one(dst, out + AES_BLOCK_SIZE, plain_len);
+ skcipher_request_set_crypt(req, src, dst, plain_len, v);
+ res = crypto_skcipher_encrypt(req);
+ skcipher_request_free(req);
+fail:
+ kfree(tmp);
+ crypto_free_skcipher(tfm2);
+ return res;
+}
+
+/* Note: addr[] and len[] needs to have one extra slot at the end. */
+static int aes_siv_decrypt(const u8 *key, size_t key_len,
+ const u8 *iv_crypt, size_t iv_c_len,
+ size_t num_elem, const u8 *addr[], size_t len[],
+ u8 *out)
+{
+ struct crypto_cipher *tfm;
+ struct crypto_skcipher *tfm2;
+ struct skcipher_request *req;
+ struct scatterlist src[1], dst[1];
+ size_t crypt_len;
+ int res;
+ u8 frame_iv[AES_BLOCK_SIZE], iv[AES_BLOCK_SIZE];
+ u8 check[AES_BLOCK_SIZE];
+
+ crypt_len = iv_c_len - AES_BLOCK_SIZE;
+ key_len /= 2; /* S2V key || CTR key */
+ addr[num_elem] = out;
+ len[num_elem] = crypt_len;
+ num_elem++;
+
+ memcpy(iv, iv_crypt, AES_BLOCK_SIZE);
+ memcpy(frame_iv, iv_crypt, AES_BLOCK_SIZE);
+
+ /* Synthetic IV to be used as the initial counter in CTR:
+ * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
+ */
+ iv[8] &= 0x7f;
+ iv[12] &= 0x7f;
+
+ /* CTR */
+
+ tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm2))
+ return PTR_ERR(tfm2);
+ /* K2 for CTR */
+ res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
+ if (res) {
+ crypto_free_skcipher(tfm2);
+ return res;
+ }
+
+ req = skcipher_request_alloc(tfm2, GFP_KERNEL);
+ if (!req) {
+ crypto_free_skcipher(tfm2);
+ return -ENOMEM;
+ }
+
+ sg_init_one(src, iv_crypt + AES_BLOCK_SIZE, crypt_len);
+ sg_init_one(dst, out, crypt_len);
+ skcipher_request_set_crypt(req, src, dst, crypt_len, iv);
+ res = crypto_skcipher_decrypt(req);
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm2);
+ if (res)
+ return res;
+
+ /* S2V */
+
+ tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ /* K1 for S2V */
+ res = crypto_cipher_setkey(tfm, key, key_len);
+ if (!res)
+ res = aes_s2v(tfm, num_elem, addr, len, check);
+ crypto_free_cipher(tfm);
+ if (res)
+ return res;
+ if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
+ return -EINVAL;
+ return 0;
+}
+
+int fils_encrypt_assoc_req(struct sk_buff *skb,
+ struct ieee80211_mgd_assoc_data *assoc_data)
+{
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ u8 *capab, *ies, *encr;
+ const u8 *addr[5 + 1], *session;
+ size_t len[5 + 1];
+ size_t crypt_len;
+
+ if (ieee80211_is_reassoc_req(mgmt->frame_control)) {
+ capab = (u8 *)&mgmt->u.reassoc_req.capab_info;
+ ies = mgmt->u.reassoc_req.variable;
+ } else {
+ capab = (u8 *)&mgmt->u.assoc_req.capab_info;
+ ies = mgmt->u.assoc_req.variable;
+ }
+
+ session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
+ ies, skb->data + skb->len - ies);
+ if (!session || session[1] != 1 + 8)
+ return -EINVAL;
+ /* encrypt after FILS Session element */
+ encr = (u8 *)session + 2 + 1 + 8;
+
+ /* AES-SIV AAD vectors */
+
+ /* The STA's MAC address */
+ addr[0] = mgmt->sa;
+ len[0] = ETH_ALEN;
+ /* The AP's BSSID */
+ addr[1] = mgmt->da;
+ len[1] = ETH_ALEN;
+ /* The STA's nonce */
+ addr[2] = assoc_data->fils_nonces;
+ len[2] = FILS_NONCE_LEN;
+ /* The AP's nonce */
+ addr[3] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
+ len[3] = FILS_NONCE_LEN;
+ /* The (Re)Association Request frame from the Capability Information
+ * field to the FILS Session element (both inclusive).
+ */
+ addr[4] = capab;
+ len[4] = encr - capab;
+
+ crypt_len = skb->data + skb->len - encr;
+ skb_put(skb, AES_BLOCK_SIZE);
+ return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
+ encr, crypt_len, 5, addr, len, encr);
+}
+
+int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
+ u8 *frame, size_t *frame_len,
+ struct ieee80211_mgd_assoc_data *assoc_data)
+{
+ struct ieee80211_mgmt *mgmt = (void *)frame;
+ u8 *capab, *ies, *encr;
+ const u8 *addr[5 + 1], *session;
+ size_t len[5 + 1];
+ int res;
+ size_t crypt_len;
+
+ if (*frame_len < 24 + 6)
+ return -EINVAL;
+
+ capab = (u8 *)&mgmt->u.assoc_resp.capab_info;
+ ies = mgmt->u.assoc_resp.variable;
+ session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
+ ies, frame + *frame_len - ies);
+ if (!session || session[1] != 1 + 8) {
+ mlme_dbg(sdata,
+ "No (valid) FILS Session element in (Re)Association Response frame from %pM",
+ mgmt->sa);
+ return -EINVAL;
+ }
+ /* decrypt after FILS Session element */
+ encr = (u8 *)session + 2 + 1 + 8;
+
+ /* AES-SIV AAD vectors */
+
+ /* The AP's BSSID */
+ addr[0] = mgmt->sa;
+ len[0] = ETH_ALEN;
+ /* The STA's MAC address */
+ addr[1] = mgmt->da;
+ len[1] = ETH_ALEN;
+ /* The AP's nonce */
+ addr[2] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
+ len[2] = FILS_NONCE_LEN;
+ /* The STA's nonce */
+ addr[3] = assoc_data->fils_nonces;
+ len[3] = FILS_NONCE_LEN;
+ /* The (Re)Association Response frame from the Capability Information
+ * field to the FILS Session element (both inclusive).
+ */
+ addr[4] = capab;
+ len[4] = encr - capab;
+
+ crypt_len = frame + *frame_len - encr;
+ if (crypt_len < AES_BLOCK_SIZE) {
+ mlme_dbg(sdata,
+ "Not enough room for AES-SIV data after FILS Session element in (Re)Association Response frame from %pM",
+ mgmt->sa);
+ return -EINVAL;
+ }
+ res = aes_siv_decrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
+ encr, crypt_len, 5, addr, len, encr);
+ if (res != 0) {
+ mlme_dbg(sdata,
+ "AES-SIV decryption of (Re)Association Response frame from %pM failed",
+ mgmt->sa);
+ return res;
+ }
+ *frame_len -= AES_BLOCK_SIZE;
+ return 0;
+}
diff --git a/net/mac80211/fils_aead.h b/net/mac80211/fils_aead.h
new file mode 100644
index 000000000000..fbc65232f0b3
--- /dev/null
+++ b/net/mac80211/fils_aead.h
@@ -0,0 +1,19 @@
+/*
+ * FILS AEAD for (Re)Association Request/Response frames
+ * Copyright 2016, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef FILS_AEAD_H
+#define FILS_AEAD_H
+
+int fils_encrypt_assoc_req(struct sk_buff *skb,
+ struct ieee80211_mgd_assoc_data *assoc_data);
+int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
+ u8 *frame, size_t *frame_len,
+ struct ieee80211_mgd_assoc_data *assoc_data);
+
+#endif /* FILS_AEAD_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342c31b8..b2069fbd60f9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -3,7 +3,7 @@
* Copyright 2005, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
- * Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2013-2015 Intel Mobile Communications GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -84,8 +84,12 @@ struct ieee80211_local;
#define IEEE80211_DEFAULT_MAX_SP_LEN \
IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
+extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
+
#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
+#define IEEE80211_MAX_NAN_INSTANCE_ID 255
+
struct ieee80211_fragment_entry {
struct sk_buff_head skb_list;
unsigned long first_frag_time;
@@ -155,7 +159,7 @@ enum ieee80211_bss_valid_data_flags {
IEEE80211_BSS_VALID_ERP = BIT(3)
};
-typedef unsigned __bitwise__ ieee80211_tx_result;
+typedef unsigned __bitwise ieee80211_tx_result;
#define TX_CONTINUE ((__force ieee80211_tx_result) 0u)
#define TX_DROP ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
@@ -176,7 +180,7 @@ struct ieee80211_tx_data {
};
-typedef unsigned __bitwise__ ieee80211_rx_result;
+typedef unsigned __bitwise ieee80211_rx_result;
#define RX_CONTINUE ((__force ieee80211_rx_result) 0u)
#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u)
#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
@@ -305,6 +309,7 @@ struct ieee80211_if_vlan {
/* used for all tx if the VLAN is configured to 4-addr mode */
struct sta_info __rcu *sta;
+ atomic_t num_mcast_sta; /* number of stations receiving multicast */
};
struct mesh_stats {
@@ -396,6 +401,10 @@ struct ieee80211_mgd_assoc_data {
struct ieee80211_vht_cap ap_vht_cap;
+ u8 fils_nonces[2 * FILS_NONCE_LEN];
+ u8 fils_kek[FILS_MAX_KEK_LEN];
+ size_t fils_kek_len;
+
size_t ie_len;
u8 ie[];
};
@@ -440,7 +449,7 @@ struct ieee80211_if_managed {
struct ieee80211_mgd_auth_data *auth_data;
struct ieee80211_mgd_assoc_data *assoc_data;
- u8 bssid[ETH_ALEN];
+ u8 bssid[ETH_ALEN] __aligned(2);
u16 aid;
@@ -813,17 +822,39 @@ enum txq_info_flags {
* @def_flow: used as a fallback flow when a packet destined to @tin hashes to
* a fq_flow which is already owned by a different tin
* @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
*/
struct txq_info {
struct fq_tin tin;
struct fq_flow def_flow;
struct codel_vars def_cvars;
+ struct codel_stats cstats;
+ struct sk_buff_head frags;
unsigned long flags;
/* keep last! */
struct ieee80211_txq txq;
};
+struct ieee80211_if_mntr {
+ u32 flags;
+ u8 mu_follow_addr[ETH_ALEN] __aligned(2);
+};
+
+/**
+ * struct ieee80211_if_nan - NAN state
+ *
+ * @conf: current NAN configuration
+ * @func_ids: a bitmap of available instance_id's
+ */
+struct ieee80211_if_nan {
+ struct cfg80211_nan_conf conf;
+
+ /* protects function_inst_ids */
+ spinlock_t func_lock;
+ struct idr function_inst_ids;
+};
+
struct ieee80211_sub_if_data {
struct list_head list;
@@ -922,7 +953,8 @@ struct ieee80211_sub_if_data {
struct ieee80211_if_ibss ibss;
struct ieee80211_if_mesh mesh;
struct ieee80211_if_ocb ocb;
- u32 mntr_flags;
+ struct ieee80211_if_mntr mntr;
+ struct ieee80211_if_nan nan;
} u;
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -1112,7 +1144,6 @@ struct ieee80211_local {
struct fq fq;
struct codel_vars *cvars;
struct codel_params cparams;
- struct codel_stats cstats;
const struct ieee80211_ops *ops;
@@ -1208,7 +1239,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
- struct rhashtable sta_hash;
+ struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
@@ -1476,6 +1507,13 @@ static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
return container_of(txq, struct txq_info, txq);
}
+static inline bool txq_has_queue(struct ieee80211_txq *txq)
+{
+ struct txq_info *txqi = to_txq_info(txq);
+
+ return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
+}
+
static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
{
return ether_addr_equal(raddr, addr) ||
@@ -1496,6 +1534,23 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
return false;
}
+void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
+void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata);
+
+/* This function returns the number of multicast stations connected to this
+ * interface. It returns -1 if that number is not tracked, that is for netdevs
+ * not in AP or AP_VLAN mode or when using 4addr.
+ */
+static inline int
+ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ return atomic_read(&sdata->u.ap.num_mcast_sta);
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta)
+ return atomic_read(&sdata->u.vlan.num_mcast_sta);
+ return -1;
+}
+
u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
struct ieee80211_rx_status *status,
unsigned int mpdu_len,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b123a9e325b3..d37ae7dc114b 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -6,6 +6,7 @@
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -43,6 +44,8 @@
* by either the RTNL, the iflist_mtx or RCU.
*/
+static void ieee80211_iface_work(struct work_struct *work);
+
bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_chanctx_conf *chanctx_conf;
@@ -148,15 +151,6 @@ void ieee80211_recalc_idle(struct ieee80211_local *local)
ieee80211_hw_config(local, change);
}
-static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 256 || new_mtu > IEEE80211_MAX_DATA_LEN)
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
bool check_dup)
{
@@ -188,7 +182,7 @@ static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
continue;
if (iter->vif.type == NL80211_IFTYPE_MONITOR &&
- !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE))
continue;
m = iter->vif.addr;
@@ -217,7 +211,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
return -EBUSY;
if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
- !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
check_dup = false;
ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup);
@@ -325,6 +319,9 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
int n_queues = sdata->local->hw.queues;
int i;
+ if (iftype == NL80211_IFTYPE_NAN)
+ return 0;
+
if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
if (WARN_ON_ONCE(sdata->vif.hw_queue[i] ==
@@ -357,7 +354,7 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
const int offset)
{
struct ieee80211_local *local = sdata->local;
- u32 flags = sdata->u.mntr_flags;
+ u32 flags = sdata->u.mntr.flags;
#define ADJUST(_f, _s) do { \
if (flags & MONITOR_FLAG_##_f) \
@@ -448,6 +445,9 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
return ret;
}
+ skb_queue_head_init(&sdata->skb_queue);
+ INIT_WORK(&sdata->work, ieee80211_iface_work);
+
return 0;
}
@@ -540,6 +540,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_NAN:
/* no special treatment */
break;
case NL80211_IFTYPE_UNSPECIFIED:
@@ -589,12 +590,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
}
break;
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
local->cooked_mntrs++;
break;
}
- if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
res = drv_add_interface(local, sdata);
if (res)
goto err_stop;
@@ -641,7 +642,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
local->fif_probe_req++;
}
- if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE)
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN)
changed |= ieee80211_reset_erp_info(sdata);
ieee80211_bss_info_change_notify(sdata, changed);
@@ -655,6 +657,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
break;
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
break;
default:
/* not reached */
@@ -787,6 +790,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
struct ps_data *ps;
struct cfg80211_chan_def chandef;
bool cancel_scan;
+ struct cfg80211_nan_func *func;
clear_bit(SDATA_STATE_RUNNING, &sdata->state);
@@ -926,7 +930,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
/* no need to tell driver */
break;
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
local->cooked_mntrs--;
break;
}
@@ -939,6 +943,18 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
ieee80211_adjust_monitor_flags(sdata, -1);
break;
+ case NL80211_IFTYPE_NAN:
+ /* clean all the functions */
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) {
+ idr_remove(&sdata->u.nan.function_inst_ids, i);
+ cfg80211_free_nan_func(func);
+ }
+ idr_destroy(&sdata->u.nan.function_inst_ids);
+
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+ break;
case NL80211_IFTYPE_P2P_DEVICE:
/* relies on synchronize_rcu() below */
RCU_INIT_POINTER(local->p2p_sdata, NULL);
@@ -1012,7 +1028,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
ieee80211_recalc_idle(local);
mutex_unlock(&local->mtx);
- if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
break;
/* fall through */
@@ -1142,7 +1158,6 @@ static const struct net_device_ops ieee80211_dataif_ops = {
.ndo_uninit = ieee80211_uninit,
.ndo_start_xmit = ieee80211_subif_start_xmit,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
- .ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_netdev_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
@@ -1176,7 +1191,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
.ndo_uninit = ieee80211_uninit,
.ndo_start_xmit = ieee80211_monitor_start_xmit,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
- .ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_monitor_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
@@ -1282,6 +1296,26 @@ static void ieee80211_iface_work(struct work_struct *work)
} else if (ieee80211_is_action(mgmt->frame_control) &&
mgmt->u.action.category == WLAN_CATEGORY_VHT) {
switch (mgmt->u.action.u.vht_group_notif.action_code) {
+ case WLAN_VHT_ACTION_OPMODE_NOTIF: {
+ struct ieee80211_rx_status *status;
+ enum nl80211_band band;
+ u8 opmode;
+
+ status = IEEE80211_SKB_RXCB(skb);
+ band = status->band;
+ opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, mgmt->sa);
+
+ if (sta)
+ ieee80211_vht_handle_opmode(sdata, sta,
+ opmode,
+ band);
+
+ mutex_unlock(&local->sta_mtx);
+ break;
+ }
case WLAN_VHT_ACTION_GROUPID_MGMT:
ieee80211_process_mu_groups(sdata, mgmt);
break;
@@ -1444,12 +1478,17 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_MONITOR:
sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
- sdata->u.mntr_flags = MONITOR_FLAG_CONTROL |
+ sdata->u.mntr.flags = MONITOR_FLAG_CONTROL |
MONITOR_FLAG_OTHER_BSS;
break;
case NL80211_IFTYPE_WDS:
sdata->vif.bss_conf.bssid = NULL;
break;
+ case NL80211_IFTYPE_NAN:
+ idr_init(&sdata->u.nan.function_inst_ids);
+ spin_lock_init(&sdata->u.nan.func_lock);
+ sdata->vif.bss_conf.bssid = sdata->vif.addr;
+ break;
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_P2P_DEVICE:
sdata->vif.bss_conf.bssid = sdata->vif.addr;
@@ -1717,7 +1756,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ASSERT_RTNL();
- if (type == NL80211_IFTYPE_P2P_DEVICE) {
+ if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) {
struct wireless_dev *wdev;
sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size,
@@ -1855,6 +1894,10 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);
+ /* MTU range: 256 - 2304 */
+ ndev->min_mtu = 256;
+ ndev->max_mtu = IEEE80211_MAX_DATA_LEN;
+
ret = register_netdevice(ndev);
if (ret) {
ieee80211_if_free(ndev);
@@ -1976,3 +2019,19 @@ void ieee80211_iface_exit(void)
{
unregister_netdevice_notifier(&mac80211_netdev_notifier);
}
+
+void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ atomic_inc(&sdata->u.ap.num_mcast_sta);
+ else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ atomic_inc(&sdata->u.vlan.num_mcast_sta);
+}
+
+void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ atomic_dec(&sdata->u.ap.num_mcast_sta);
+ else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ atomic_dec(&sdata->u.vlan.num_mcast_sta);
+}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index edd6f2945f69..a98fc2b5e0dc 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -265,7 +265,8 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
if (uni) {
rcu_assign_pointer(sdata->default_unicast_key, key);
ieee80211_check_fast_xmit_iface(sdata);
- drv_set_default_unicast_key(sdata->local, sdata, idx);
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
+ drv_set_default_unicast_key(sdata->local, sdata, idx);
}
if (multi)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index d00ea9b13f49..56fb47953b72 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -549,6 +549,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
NL80211_FEATURE_MAC_ON_CREATE |
NL80211_FEATURE_USERSPACE_MPM |
NL80211_FEATURE_FULL_AP_CLIENT_STATE;
+ wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA);
if (!ops->hw_scan)
wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
@@ -660,6 +661,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
ieee80211_roc_setup(local);
+ local->hw.radiotap_timestamp.units_pos = -1;
+ local->hw.radiotap_timestamp.accuracy = -1;
+
return &local->hw;
err_free:
wiphy_free(wiphy);
@@ -818,6 +822,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
!local->ops->tdls_recv_channel_switch))
return -EOPNOTSUPP;
+ if (WARN_ON(ieee80211_hw_check(hw, SUPPORTS_TX_FRAG) &&
+ !local->ops->set_frag_threshold))
+ return -EINVAL;
+
+ if (WARN_ON(local->hw.wiphy->interface_modes &
+ BIT(NL80211_IFTYPE_NAN) &&
+ (!local->ops->start_nan || !local->ops->stop_nan)))
+ return -EINVAL;
+
#ifdef CONFIG_PM
if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume))
return -EINVAL;
@@ -900,12 +913,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_ht = supp_ht || sband->ht_cap.ht_supported;
supp_vht = supp_vht || sband->vht_cap.vht_supported;
- if (sband->ht_cap.ht_supported)
- local->rx_chains =
- max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
- local->rx_chains);
+ if (!sband->ht_cap.ht_supported)
+ continue;
/* TODO: consider VHT for RX chains, hopefully it's the same */
+ local->rx_chains =
+ max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
+ local->rx_chains);
+
+ /* no need to mask, SM_PS_DISABLED has all bits set */
+ sband->ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
}
/* if low-level driver supports AP, we also support VLAN */
@@ -1055,6 +1073,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->dynamic_ps_forced_timeout = -1;
+ if (!local->hw.max_nan_de_entries)
+ local->hw.max_nan_de_entries = IEEE80211_MAX_NAN_INSTANCE_ID;
+
result = ieee80211_wep_init(local);
if (result < 0)
wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..50e1b7f78bd4 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -339,7 +339,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
/* fast-forward to vendor IEs */
offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
- if (offset) {
+ if (offset < ifmsh->ie_len) {
len = ifmsh->ie_len - offset;
data = ifmsh->ie + offset;
if (skb_tailroom(skb) < len)
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index faccef977670..b747c9645e43 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -326,22 +326,33 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
u32 tx_time, estimated_retx;
u64 result;
- if (sta->mesh->fail_avg >= 100)
- return MAX_METRIC;
+ /* Try to get rate based on HW/SW RC algorithm.
+ * Rate is returned in units of Kbps, correct this
+ * to comply with airtime calculation units
+ * Round up in case we get rate < 100Kbps
+ */
+ rate = DIV_ROUND_UP(sta_get_expected_throughput(sta), 100);
- sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo);
- rate = cfg80211_calculate_bitrate(&rinfo);
- if (WARN_ON(!rate))
- return MAX_METRIC;
+ if (rate) {
+ err = 0;
+ } else {
+ if (sta->mesh->fail_avg >= 100)
+ return MAX_METRIC;
- err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100;
+ sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo);
+ rate = cfg80211_calculate_bitrate(&rinfo);
+ if (WARN_ON(!rate))
+ return MAX_METRIC;
+
+ err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100;
+ }
/* bitrate is in units of 100 Kbps, while we need rate in units of
* 1Mbps. This will be corrected on tx_time computation.
*/
tx_time = (device_constant + 10 * test_frame_len / rate);
estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err));
- result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ;
+ result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT);
return (u32)result;
}
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index 64bc22ad9496..faca22cd02b5 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -28,7 +28,7 @@
* could be, for instance, in case a neighbor is restarted and its TSF counter
* reset.
*/
-#define TOFFSET_MAXIMUM_ADJUSTMENT 30000 /* 30 ms */
+#define TOFFSET_MAXIMUM_ADJUSTMENT 800 /* 0.8 ms */
struct sync_method {
u8 method;
@@ -70,9 +70,13 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
}
spin_unlock_bh(&ifmsh->sync_offset_lock);
- tsf = drv_get_tsf(local, sdata);
- if (tsf != -1ULL)
- drv_set_tsf(local, sdata, tsf + tsfdelta);
+ if (local->ops->offset_tsf) {
+ drv_offset_tsf(local, sdata, tsfdelta);
+ } else {
+ tsf = drv_get_tsf(local, sdata);
+ if (tsf != -1ULL)
+ drv_set_tsf(local, sdata, tsf + tsfdelta);
+ }
}
static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8d426f637f58..098ce9b179ee 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -30,6 +30,7 @@
#include "driver-ops.h"
#include "rate.h"
#include "led.h"
+#include "fils_aead.h"
#define IEEE80211_AUTH_TIMEOUT (HZ / 5)
#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2)
@@ -652,6 +653,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
2 + sizeof(struct ieee80211_ht_cap) + /* HT */
2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
assoc_data->ie_len + /* extra IEs */
+ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
9, /* WMM */
GFP_KERNEL);
if (!skb)
@@ -875,6 +877,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
memcpy(pos, assoc_data->ie + offset, noffset - offset);
}
+ if (assoc_data->fils_kek_len &&
+ fils_encrypt_assoc_req(skb, assoc_data) < 0) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
drv_mgd_prepare_tx(local, sdata);
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
@@ -1672,11 +1680,15 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
non_acm_ac++)
if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac)))
break;
- /* The loop will result in using BK even if it requires
- * admission control, such configuration makes no sense
- * and we have to transmit somehow - the AC selection
- * does the same thing.
+ /* Usually the loop will result in using BK even if it
+ * requires admission control, but such a configuration
+ * makes no sense and we have to transmit somehow - the
+ * AC selection does the same thing.
+ * If we started out trying to downgrade from BK, then
+ * the extra condition here might be needed.
*/
+ if (non_acm_ac >= IEEE80211_NUM_ACS)
+ non_acm_ac = IEEE80211_AC_BK;
if (drv_conf_tx(local, sdata, ac,
&sdata->tx_conf[non_acm_ac]))
sdata_err(sdata,
@@ -2506,7 +2518,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
}
static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
- bool assoc)
+ bool assoc, bool abandon)
{
struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
@@ -2529,6 +2541,9 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
mutex_lock(&sdata->local->mtx);
ieee80211_vif_release_channel(sdata);
mutex_unlock(&sdata->local->mtx);
+
+ if (abandon)
+ cfg80211_abandon_assoc(sdata->dev, assoc_data->bss);
}
kfree(assoc_data);
@@ -2614,6 +2629,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
case WLAN_AUTH_LEAP:
case WLAN_AUTH_FT:
case WLAN_AUTH_SAE:
+ case WLAN_AUTH_FILS_SK:
+ case WLAN_AUTH_FILS_SK_PFS:
+ case WLAN_AUTH_FILS_PK:
break;
case WLAN_AUTH_SHARED_KEY:
if (ifmgd->auth_data->expected_transaction != 4) {
@@ -2758,7 +2776,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
bssid, reason_code,
ieee80211_get_reason_code_string(reason_code));
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
return;
@@ -3139,6 +3157,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
reassoc ? "Rea" : "A", mgmt->sa,
capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
+ if (assoc_data->fils_kek_len &&
+ fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0)
+ return;
+
pos = mgmt->u.assoc_resp.variable;
ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
@@ -3163,14 +3185,14 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
if (status_code != WLAN_STATUS_SUCCESS) {
sdata_info(sdata, "%pM denied association (code=%d)\n",
mgmt->sa, status_code);
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
event.u.mlme.status = MLME_DENIED;
event.u.mlme.reason = status_code;
drv_event_callback(sdata->local, sdata, &event);
} else {
if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) {
/* oops -- internal error -- send timeout for now */
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
return;
}
@@ -3183,13 +3205,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
* recalc after assoc_data is NULL but before associated
* is set can cause the interface to go idle
*/
- ieee80211_destroy_assoc_data(sdata, true);
+ ieee80211_destroy_assoc_data(sdata, true, false);
/* get uapsd queues configuration */
uapsd_queues = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
if (sdata->tx_conf[ac].uapsd)
- uapsd_queues |= BIT(ac);
+ uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
}
cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues);
@@ -3882,7 +3904,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
.u.mlme.status = MLME_TIMEOUT,
};
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
drv_event_callback(sdata->local, sdata, &event);
}
@@ -4021,7 +4043,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
WLAN_REASON_DEAUTH_LEAVING,
false, frame_buf);
if (ifmgd->assoc_data)
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
if (ifmgd->auth_data)
ieee80211_destroy_auth_data(sdata, false);
cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf,
@@ -4475,24 +4497,36 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
case NL80211_AUTHTYPE_SAE:
auth_alg = WLAN_AUTH_SAE;
break;
+ case NL80211_AUTHTYPE_FILS_SK:
+ auth_alg = WLAN_AUTH_FILS_SK;
+ break;
+ case NL80211_AUTHTYPE_FILS_SK_PFS:
+ auth_alg = WLAN_AUTH_FILS_SK_PFS;
+ break;
+ case NL80211_AUTHTYPE_FILS_PK:
+ auth_alg = WLAN_AUTH_FILS_PK;
+ break;
default:
return -EOPNOTSUPP;
}
- auth_data = kzalloc(sizeof(*auth_data) + req->sae_data_len +
+ auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len +
req->ie_len, GFP_KERNEL);
if (!auth_data)
return -ENOMEM;
auth_data->bss = req->bss;
- if (req->sae_data_len >= 4) {
- __le16 *pos = (__le16 *) req->sae_data;
- auth_data->sae_trans = le16_to_cpu(pos[0]);
- auth_data->sae_status = le16_to_cpu(pos[1]);
- memcpy(auth_data->data, req->sae_data + 4,
- req->sae_data_len - 4);
- auth_data->data_len += req->sae_data_len - 4;
+ if (req->auth_data_len >= 4) {
+ if (req->auth_type == NL80211_AUTHTYPE_SAE) {
+ __le16 *pos = (__le16 *) req->auth_data;
+
+ auth_data->sae_trans = le16_to_cpu(pos[0]);
+ auth_data->sae_status = le16_to_cpu(pos[1]);
+ }
+ memcpy(auth_data->data, req->auth_data + 4,
+ req->auth_data_len - 4);
+ auth_data->data_len += req->auth_data_len - 4;
}
if (req->ie && req->ie_len) {
@@ -4688,6 +4722,21 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
assoc_data->ie_len = req->ie_len;
}
+ if (req->fils_kek) {
+ /* should already be checked in cfg80211 - so warn */
+ if (WARN_ON(req->fils_kek_len > FILS_MAX_KEK_LEN)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ memcpy(assoc_data->fils_kek, req->fils_kek,
+ req->fils_kek_len);
+ assoc_data->fils_kek_len = req->fils_kek_len;
+ }
+
+ if (req->fils_nonces)
+ memcpy(assoc_data->fils_nonces, req->fils_nonces,
+ 2 * FILS_NONCE_LEN);
+
assoc_data->bss = req->bss;
if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) {
@@ -4903,7 +4952,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
frame_buf);
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
req->reason_code);
@@ -4978,7 +5027,7 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
sdata_lock(sdata);
if (ifmgd->assoc_data) {
struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
}
if (ifmgd->auth_data)
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 55a9c5b94ce1..eede5c6db8d5 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -128,7 +128,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
if (!ieee80211_sdata_running(sdata))
continue;
- if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE)
+ if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
+ sdata->vif.type == NL80211_IFTYPE_NAN)
continue;
if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
@@ -819,7 +820,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT)
break;
rcu_read_lock();
- sta = sta_info_get(sdata, mgmt->da);
+ sta = sta_info_get_bss(sdata, mgmt->da);
rcu_read_unlock();
if (!sta)
return -ENOLINK;
@@ -838,6 +839,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
case NL80211_IFTYPE_P2P_DEVICE:
need_offchan = true;
break;
+ case NL80211_IFTYPE_NAN:
default:
return -EOPNOTSUPP;
}
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 00a43a70e1fc..28a3a0957c9e 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -178,8 +178,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
WARN_ON(!list_empty(&local->chanctx_list));
/* stop hardware - this must stop RX */
- if (local->open_count)
- ieee80211_stop_device(local);
+ ieee80211_stop_device(local);
suspend:
local->suspended = true;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b157908..3090dd4342f6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
len += 12;
}
+ if (local->hw.radiotap_timestamp.units_pos >= 0) {
+ len = ALIGN(len, 8);
+ len += 12;
+ }
+
if (status->chains) {
/* antenna and antenna signal fields */
len += 2 * hweight8(status->chains);
@@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
pos += 2;
}
+ if (local->hw.radiotap_timestamp.units_pos >= 0) {
+ u16 accuracy = 0;
+ u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT;
+
+ rthdr->it_present |=
+ cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP);
+
+ /* ensure 8 byte alignment */
+ while ((pos - (u8 *)rthdr) & 7)
+ pos++;
+
+ put_unaligned_le64(status->device_timestamp, pos);
+ pos += sizeof(u64);
+
+ if (local->hw.radiotap_timestamp.accuracy >= 0) {
+ accuracy = local->hw.radiotap_timestamp.accuracy;
+ flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY;
+ }
+ put_unaligned_le16(accuracy, pos);
+ pos += sizeof(u16);
+
+ *pos++ = local->hw.radiotap_timestamp.units_pos;
+ *pos++ = flags;
+ }
+
for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
*pos++ = status->chain_signal[chain];
*pos++ = chain;
@@ -485,6 +515,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
struct net_device *prev_dev = NULL;
int present_fcs_len = 0;
unsigned int rtap_vendor_space = 0;
+ struct ieee80211_mgmt *mgmt;
+ struct ieee80211_sub_if_data *monitor_sdata =
+ rcu_dereference(local->monitor_sdata);
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
@@ -567,7 +600,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
continue;
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
continue;
if (!ieee80211_sdata_running(sdata))
@@ -585,6 +618,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
ieee80211_rx_stats(sdata->dev, skb->len);
}
+ mgmt = (void *)skb->data;
+ if (monitor_sdata &&
+ skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
+ ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category == WLAN_CATEGORY_VHT &&
+ mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
+ is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
+ ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
+ struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
+
+ if (mu_skb) {
+ mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
+ ieee80211_queue_work(&local->hw, &monitor_sdata->work);
+ }
+ }
+
if (prev_dev) {
skb->dev = prev_dev;
netif_receive_skb(skb);
@@ -1072,8 +1122,15 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
- if (!tid_agg_rx)
+ if (!tid_agg_rx) {
+ if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
+ !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
+ !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
+ ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
+ WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_REQUIRE_SETUP);
goto dont_reorder;
+ }
/* qos null data frames are excluded */
if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
@@ -1266,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta)
return;
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
- struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
- if (txqi->tin.backlog_packets)
+ if (txq_has_queue(sta->sta.txq[tid]))
set_bit(tid, &sta->txq_buffered_tids);
else
clear_bit(tid, &sta->txq_buffered_tids);
@@ -1339,13 +1394,15 @@ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
u8 ac = ieee802_1d_to_ac[tid & 7];
/*
- * If this AC is not trigger-enabled do nothing.
+ * If this AC is not trigger-enabled do nothing unless the
+ * driver is calling us after it already checked.
*
* NB: This could/should check a separate bitmap of trigger-
* enabled queues, but for now we only implement uAPSD w/o
* TSPEC changes to the ACs, so they're always the same.
*/
- if (!(sta->sta.uapsd_queues & BIT(ac)))
+ if (!(sta->sta.uapsd_queues & ieee80211_ac_to_qos_mask[ac]) &&
+ tid != IEEE80211_NUM_TIDS)
return;
/* if we are in a service period, do nothing */
@@ -2160,7 +2217,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
- if (is_multicast_ether_addr(ehdr->h_dest)) {
+ if (is_multicast_ether_addr(ehdr->h_dest) &&
+ ieee80211_vif_get_num_mcast_if(sdata) != 0) {
/*
* send multicast frames both to higher layers in
* local net stack and back to the wireless medium
@@ -2169,7 +2227,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
if (!xmit_skb)
net_info_ratelimited("%s: failed to clone multicast frame\n",
dev->name);
- } else {
+ } else if (!is_multicast_ether_addr(ehdr->h_dest)) {
dsta = sta_info_get(sdata, skb->data);
if (dsta) {
/*
@@ -2243,6 +2301,8 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
__le16 fc = hdr->frame_control;
struct sk_buff_head frame_list;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ struct ethhdr ethhdr;
+ const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source;
if (unlikely(!ieee80211_is_data(fc)))
return RX_CONTINUE;
@@ -2253,24 +2313,53 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
if (!(status->rx_flags & IEEE80211_RX_AMSDU))
return RX_CONTINUE;
- if (ieee80211_has_a4(hdr->frame_control) &&
- rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- !rx->sdata->u.vlan.sta)
- return RX_DROP_UNUSABLE;
+ if (unlikely(ieee80211_has_a4(hdr->frame_control))) {
+ switch (rx->sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ if (!rx->sdata->u.vlan.sta)
+ return RX_DROP_UNUSABLE;
+ break;
+ case NL80211_IFTYPE_STATION:
+ if (!rx->sdata->u.mgd.use_4addr)
+ return RX_DROP_UNUSABLE;
+ break;
+ default:
+ return RX_DROP_UNUSABLE;
+ }
+ check_da = NULL;
+ check_sa = NULL;
+ } else switch (rx->sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ check_da = NULL;
+ break;
+ case NL80211_IFTYPE_STATION:
+ if (!rx->sta ||
+ !test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER))
+ check_sa = NULL;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ check_sa = NULL;
+ break;
+ default:
+ break;
+ }
- if (is_multicast_ether_addr(hdr->addr1) &&
- ((rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- rx->sdata->u.vlan.sta) ||
- (rx->sdata->vif.type == NL80211_IFTYPE_STATION &&
- rx->sdata->u.mgd.use_4addr)))
+ if (is_multicast_ether_addr(hdr->addr1))
return RX_DROP_UNUSABLE;
skb->dev = dev;
__skb_queue_head_init(&frame_list);
+ if (ieee80211_data_to_8023_exthdr(skb, &ethhdr,
+ rx->sdata->vif.addr,
+ rx->sdata->vif.type))
+ return RX_DROP_UNUSABLE;
+
ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
rx->sdata->vif.type,
- rx->local->hw.extra_tx_headroom, true);
+ rx->local->hw.extra_tx_headroom,
+ check_da, check_sa);
while (!skb_queue_empty(&frame_list)) {
rx->skb = __skb_dequeue(&frame_list);
@@ -2383,7 +2472,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
if (!ifmsh->mshcfg.dot11MeshForwarding)
goto out;
- fwd_skb = skb_copy(skb, GFP_ATOMIC);
+ fwd_skb = skb_copy_expand(skb, local->tx_headroom +
+ sdata->encrypt_headroom, 0, GFP_ATOMIC);
if (!fwd_skb) {
net_info_ratelimited("%s: failed to clone mesh frame\n",
sdata->name);
@@ -2535,6 +2625,12 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
tid = le16_to_cpu(bar_data.control) >> 12;
+ if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
+ !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
+ ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
+ WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_REQUIRE_SETUP);
+
tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
if (!tid_agg_rx)
return RX_DROP_MONITOR;
@@ -2785,17 +2881,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
case WLAN_VHT_ACTION_OPMODE_NOTIF: {
- u8 opmode;
-
/* verify opmode is present */
if (len < IEEE80211_MIN_ACTION_SIZE + 2)
goto invalid;
-
- opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
-
- ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
- opmode, status->band);
- goto handled;
+ goto queue;
}
case WLAN_VHT_ACTION_GROUPID_MGMT: {
if (len < IEEE80211_MIN_ACTION_SIZE + 25)
@@ -3147,7 +3236,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
continue;
if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
- !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES))
continue;
if (prev_dev) {
@@ -3523,6 +3612,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
ieee80211_is_probe_req(hdr->frame_control) ||
ieee80211_is_probe_resp(hdr->frame_control) ||
ieee80211_is_beacon(hdr->frame_control);
+ case NL80211_IFTYPE_NAN:
+ /* Currently no frames on NAN interface are allowed */
+ return false;
default:
break;
}
@@ -3844,21 +3936,31 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
u64_stats_update_end(&stats->syncp);
if (fast_rx->internal_forward) {
- struct sta_info *dsta = sta_info_get(rx->sdata, skb->data);
+ struct sk_buff *xmit_skb = NULL;
+ bool multicast = is_multicast_ether_addr(skb->data);
+
+ if (multicast) {
+ xmit_skb = skb_copy(skb, GFP_ATOMIC);
+ } else if (sta_info_get(rx->sdata, skb->data)) {
+ xmit_skb = skb;
+ skb = NULL;
+ }
- if (dsta) {
+ if (xmit_skb) {
/*
* Send to wireless media and increase priority by 256
* to keep the received priority instead of
* reclassifying the frame (see cfg80211_classify8021d).
*/
- skb->priority += 256;
- skb->protocol = htons(ETH_P_802_3);
- skb_reset_network_header(skb);
- skb_reset_mac_header(skb);
- dev_queue_xmit(skb);
- return true;
+ xmit_skb->priority += 256;
+ xmit_skb->protocol = htons(ETH_P_802_3);
+ skb_reset_network_header(xmit_skb);
+ skb_reset_mac_header(xmit_skb);
+ dev_queue_xmit(xmit_skb);
}
+
+ if (!skb)
+ return true;
}
/* deliver to local stack */
@@ -3940,7 +4042,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
- struct rhash_head *tmp;
+ struct rhlist_head *tmp;
int err = 0;
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +4085,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
- const struct bucket_table *tbl;
prev_sta = NULL;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+ for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 070b40f15850..23d8ac829279 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -420,7 +420,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw,
{
struct ieee80211_local *local = hw_to_local(hw);
- trace_api_scan_completed(local, info);
+ trace_api_scan_completed(local, info->aborted);
set_bit(SCAN_COMPLETED, &local->scanning);
if (info->aborted)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index aa58df80ede0..50c309094c37 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
- .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
- .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
static int sta_info_hash_del(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_remove(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
}
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr)
+{
+ return rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
+}
+
/* protected by RCU */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata ||
(sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
rcu_read_unlock();
@@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
static int sta_info_hash_add(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_insert(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void sta_deliver_ps_frames(struct work_struct *wk)
@@ -340,6 +338,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
memcpy(sta->addr, addr, ETH_ALEN);
memcpy(sta->sta.addr, addr, ETH_ALEN);
+ sta->sta.max_rx_aggregation_subframes =
+ local->hw.max_rx_aggregation_subframes;
+
sta->local = local;
sta->sdata = sdata;
sta->rx_stats.last_rx = jiffies;
@@ -450,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta)
is_multicast_ether_addr(sta->sta.addr)))
return -EINVAL;
- /* Strictly speaking this isn't necessary as we hold the mutex, but
- * the rhashtable code can't really deal with that distinction. We
- * do require the mutex for correctness though.
+ /* The RCU read lock is required by rhashtable due to
+ * asynchronous resize/rehash. We also require the mutex
+ * for correctness.
*/
rcu_read_lock();
lockdep_assert_held(&sdata->local->sta_mtx);
@@ -708,7 +709,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
unsigned long tids;
- if (ignore_for_tim & BIT(ac))
+ if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac])
continue;
indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1040,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data)
round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
}
-u32 sta_addr_hash(const void *key, u32 length, u32 seed)
-{
- return jhash(key, ETH_ALEN, seed);
-}
-
int sta_info_init(struct ieee80211_local *local)
{
int err;
- err = rhashtable_init(&local->sta_hash, &sta_rht_params);
+ err = rhltable_init(&local->sta_hash, &sta_rht_params);
if (err)
return err;
@@ -1065,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local)
void sta_info_stop(struct ieee80211_local *local)
{
del_timer_sync(&local->sta_cleanup);
- rhashtable_destroy(&local->sta_hash);
+ rhltable_destroy(&local->sta_hash);
}
@@ -1135,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
const u8 *localaddr)
{
struct ieee80211_local *local = hw_to_local(hw);
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
-
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
/*
* Just return a random station if localaddr is NULL
* ... first in list.
*/
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (localaddr &&
!ether_addr_equal(sta->sdata->vif.addr, localaddr))
continue;
@@ -1209,12 +1202,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
if (sta->sta.txq[0]) {
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
-
- if (!txqi->tin.backlog_packets)
+ if (!txq_has_queue(sta->sta.txq[i]))
continue;
- drv_wake_tx_queue(local, txqi);
+ drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
}
}
@@ -1398,7 +1389,7 @@ ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs,
return true;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
- if (ignored_acs & BIT(ac))
+ if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
continue;
if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1423,7 +1414,7 @@ ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs,
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
unsigned long tids;
- if (ignored_acs & BIT(ac))
+ if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
continue;
tids = ieee80211_tids_for_ac(ac);
@@ -1491,7 +1482,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
BIT(find_highest_prio_tid(driver_release_tids));
if (skb_queue_empty(&frames) && !driver_release_tids) {
- int tid;
+ int tid, ac;
/*
* For PS-Poll, this can only happen due to a race condition
@@ -1509,7 +1500,10 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
*/
/* This will evaluate to 1, 3, 5 or 7. */
- tid = 7 - ((ffs(~ignored_acs) - 1) << 1);
+ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++)
+ if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac]))
+ break;
+ tid = 7 - 2 * ac;
ieee80211_send_null_response(sta, tid, reason, true, false);
} else if (!driver_release_tids) {
@@ -1645,10 +1639,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
return;
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
- struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
if (!(driver_release_tids & BIT(tid)) ||
- txqi->tin.backlog_packets)
+ txq_has_queue(sta->sta.txq[tid]))
continue;
sta_info_recalc_tim(sta);
@@ -1882,10 +1874,7 @@ int sta_info_move_state(struct sta_info *sta,
if (!sta->sta.support_p2p_ps)
ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
- (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- !sta->sdata->u.vlan.sta))
- atomic_dec(&sta->sdata->bss->num_mcast_sta);
+ ieee80211_vif_dec_num_mcast(sta->sdata);
clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_clear_fast_xmit(sta);
ieee80211_clear_fast_rx(sta);
@@ -1893,10 +1882,7 @@ int sta_info_move_state(struct sta_info *sta,
break;
case IEEE80211_STA_AUTHORIZED:
if (sta->sta_state == IEEE80211_STA_ASSOC) {
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
- (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- !sta->sdata->u.vlan.sta))
- atomic_inc(&sta->sdata->bss->num_mcast_sta);
+ ieee80211_vif_inc_num_mcast(sta->sdata);
set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_check_fast_xmit(sta);
ieee80211_check_fast_rx(sta);
@@ -1986,6 +1972,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
u16 brate;
unsigned int shift;
+ rinfo->flags = 0;
sband = local->hw.wiphy->bands[(rate >> 4) & 0xf];
brate = sband->bitrates[rate & 0xf].bitrate;
if (rinfo->bw == RATE_INFO_BW_5)
@@ -2001,14 +1988,15 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
}
-static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
{
u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate);
if (rate == STA_STATS_RATE_INVALID)
- rinfo->flags = 0;
- else
- sta_stats_decode_rate(sta->local, rate, rinfo);
+ return -EINVAL;
+
+ sta_stats_decode_rate(sta->local, rate, rinfo);
+ return 0;
}
static void sta_set_tidstats(struct sta_info *sta,
@@ -2213,8 +2201,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
- sta_set_rate_info_rx(sta, &sinfo->rxrate);
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
+ if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
+ sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
}
sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
@@ -2279,11 +2267,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER);
- /* check if the driver has a SW RC implementation */
- if (ref && ref->ops->get_expected_throughput)
- thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv);
- else
- thr = drv_get_expected_throughput(local, &sta->sta);
+ thr = sta_get_expected_throughput(sta);
if (thr != 0) {
sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
@@ -2291,6 +2275,25 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
}
}
+u32 sta_get_expected_throughput(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct rate_control_ref *ref = NULL;
+ u32 thr = 0;
+
+ if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
+ ref = local->rate_ctrl;
+
+ /* check if the driver has a SW RC implementation */
+ if (ref && ref->ops->get_expected_throughput)
+ thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv);
+ else
+ thr = drv_get_expected_throughput(local, sta);
+
+ return thr;
+}
+
unsigned long ieee80211_sta_last_active(struct sta_info *sta)
{
struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 78b0ef32dddd..dd06ef0b8861 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -184,7 +184,6 @@ struct tid_ampdu_tx {
* @ssn: Starting Sequence Number expected to be aggregated.
* @buf_size: buffer size for incoming A-MPDUs
* @timeout: reset timer value (in TUs).
- * @dialog_token: dialog token for aggregation session
* @rcu_head: RCU head used for freeing this struct
* @reorder_lock: serializes access to reorder buffer, see below.
* @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and
@@ -213,7 +212,6 @@ struct tid_ampdu_rx {
u16 ssn;
u16 buf_size;
u16 timeout;
- u8 dialog_token;
bool auto_seq;
bool removed;
};
@@ -225,11 +223,14 @@ struct tid_ampdu_rx {
* to tid_tx[idx], which are protected by the sta spinlock)
* tid_start_tx is also protected by sta->lock.
* @tid_rx: aggregation info for Rx per TID -- RCU protected
+ * @tid_rx_token: dialog tokens for valid aggregation sessions
* @tid_rx_timer_expired: bitmap indicating on which TIDs the
* RX timer expired until the work for it runs
* @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
* driver requested to close until the work for it runs
* @agg_session_valid: bitmap indicating which TID has a rx BA session open on
+ * @unexpected_agg: bitmap indicating which TID already sent a delBA due to
+ * unexpected aggregation related frames outside a session
* @work: work struct for starting/stopping aggregation
* @tid_tx: aggregation info for Tx per TID
* @tid_start_tx: sessions where start was requested
@@ -241,9 +242,11 @@ struct sta_ampdu_mlme {
struct mutex mtx;
/* rx */
struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
+ u8 tid_rx_token[IEEE80211_NUM_TIDS];
unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+ unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
/* tx */
struct work_struct work;
struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
@@ -452,7 +455,7 @@ struct sta_info {
/* General information, mostly static */
struct list_head list, free_list;
struct rcu_head rcu_head;
- struct rhash_head hash_node;
+ struct rhlist_head hash_node;
u8 addr[ETH_ALEN];
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
@@ -635,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
*/
#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr);
+
/*
* Get a STA info, must be under RCU read lock.
*/
@@ -644,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr);
-u32 sta_addr_hash(const void *key, u32 length, u32 seed);
-
-#define _sta_bucket_idx(_tbl, _a) \
- rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd))
-
-#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \
- rht_for_each_entry_rcu(_sta, _tmp, tbl, \
- _sta_bucket_idx(tbl, _addr), \
- hash_node) \
- /* compare address and run code only if it matches */ \
- if (ether_addr_equal(_sta->addr, (_addr)))
+#define for_each_sta_info(local, _addr, _sta, _tmp) \
+ rhl_for_each_entry_rcu(_sta, _tmp, \
+ sta_info_hash_lookup(local, _addr), hash_node)
/*
* Get STA info by index, BROKEN!
@@ -712,6 +710,8 @@ void sta_set_rate_info_tx(struct sta_info *sta,
struct rate_info *rinfo);
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
+u32 sta_get_expected_throughput(struct sta_info *sta);
+
void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
unsigned long exp_time);
u8 sta_info_tx_streams(struct sta_info *sta);
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index a2a68269675d..ddf71c648cab 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
static void ieee80211_lost_packet(struct sta_info *sta,
struct ieee80211_tx_info *info)
{
+ /* If driver relies on its own algorithm for station kickout, skip
+ * mac80211 packet loss mechanism.
+ */
+ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK))
+ return;
+
/* This packet was aggregated but doesn't carry status info */
if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
!(info->flags & IEEE80211_TX_STAT_AMPDU))
@@ -709,7 +715,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
if (!ieee80211_sdata_running(sdata))
continue;
- if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
+ if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) &&
!send_to_cooked)
continue;
@@ -740,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
__le16 fc;
struct ieee80211_supported_band *sband;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
int retry_count;
int rates_idx;
bool send_to_cooked;
@@ -749,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
- const struct bucket_table *tbl;
rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
@@ -758,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
sband = local->hw.wiphy->bands[info->band];
fc = hdr->frame_control;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) {
+ for_each_sta_info(local, hdr->addr1, sta, tmp) {
/* skip wrong virtual interface */
if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
continue;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 77e4c53baefb..92a47afaa989 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -984,6 +984,32 @@ TRACE_EVENT(drv_set_tsf,
)
);
+TRACE_EVENT(drv_offset_tsf,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ s64 offset),
+
+ TP_ARGS(local, sdata, offset),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(s64, tsf_offset)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->tsf_offset = offset;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " tsf offset:%lld",
+ LOCAL_PR_ARG, VIF_PR_ARG,
+ (unsigned long long)__entry->tsf_offset
+ )
+);
+
DEFINE_EVENT(local_sdata_evt, drv_reset_tsf,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata),
@@ -1700,6 +1726,139 @@ TRACE_EVENT(drv_get_expected_throughput,
)
);
+TRACE_EVENT(drv_start_nan,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_nan_conf *conf),
+
+ TP_ARGS(local, sdata, conf),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, master_pref)
+ __field(u8, dual)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->master_pref = conf->master_pref;
+ __entry->dual = conf->dual;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ ", master preference: %u, dual: %d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
+ __entry->dual
+ )
+);
+
+TRACE_EVENT(drv_stop_nan,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+
+ TP_ARGS(local, sdata),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_nan_change_conf,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_nan_conf *conf,
+ u32 changes),
+
+ TP_ARGS(local, sdata, conf, changes),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, master_pref)
+ __field(u8, dual)
+ __field(u32, changes)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->master_pref = conf->master_pref;
+ __entry->dual = conf->dual;
+ __entry->changes = changes;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ ", master preference: %u, dual: %d, changes: 0x%x",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
+ __entry->dual, __entry->changes
+ )
+);
+
+TRACE_EVENT(drv_add_nan_func,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_nan_func *func),
+
+ TP_ARGS(local, sdata, func),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, type)
+ __field(u8, inst_id)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->type = func->type;
+ __entry->inst_id = func->instance_id;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ ", type: %u, inst_id: %u",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id
+ )
+);
+
+TRACE_EVENT(drv_del_nan_func,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u8 instance_id),
+
+ TP_ARGS(local, sdata, instance_id),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, instance_id)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->instance_id = instance_id;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ ", instance_id: %u",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id
+ )
+);
+
/*
* Tracing for API calls that drivers call.
*/
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 18b285e06bc8..797e847cbc49 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -331,9 +331,8 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
return TX_DROP;
}
- } else if (unlikely(tx->sdata->vif.type == NL80211_IFTYPE_AP &&
- ieee80211_is_data(hdr->frame_control) &&
- !atomic_read(&tx->sdata->u.ap.num_mcast_sta))) {
+ } else if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+ ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) {
/*
* No associated STAs - no need to send multicast
* frames.
@@ -796,36 +795,6 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
return ret;
}
-static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
- struct ieee80211_vif *vif,
- struct ieee80211_sta *pubsta,
- struct sk_buff *skb)
-{
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ieee80211_txq *txq = NULL;
-
- if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
- (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
- return NULL;
-
- if (!ieee80211_is_data(hdr->frame_control))
- return NULL;
-
- if (pubsta) {
- u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
-
- txq = pubsta->txq[tid];
- } else if (vif) {
- txq = vif->txq;
- }
-
- if (!txq)
- return NULL;
-
- return to_txq_info(txq);
-}
-
static ieee80211_tx_result debug_noinline
ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
{
@@ -883,9 +852,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
tx->sta->tx_stats.msdu[tid]++;
- if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta,
- tx->skb))
- hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+ hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
return TX_CONTINUE;
}
@@ -967,7 +934,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
if (info->flags & IEEE80211_TX_CTL_DONTFRAG)
return TX_CONTINUE;
- if (tx->local->ops->set_frag_threshold)
+ if (ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG))
return TX_CONTINUE;
/*
@@ -1274,6 +1241,39 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
return TX_CONTINUE;
}
+static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
+ struct ieee80211_vif *vif,
+ struct sta_info *sta,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_txq *txq = NULL;
+
+ if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
+ (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
+ return NULL;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return NULL;
+
+ if (sta) {
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+
+ if (!sta->uploaded)
+ return NULL;
+
+ txq = sta->sta.txq[tid];
+ } else if (vif) {
+ txq = vif->txq;
+ }
+
+ if (!txq)
+ return NULL;
+
+ return to_txq_info(txq);
+}
+
static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb)
{
IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
@@ -1344,7 +1344,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
local = container_of(fq, struct ieee80211_local, fq);
txqi = container_of(tin, struct txq_info, tin);
cparams = &local->cparams;
- cstats = &local->cstats;
+ cstats = &txqi->cstats;
if (flow == &txqi->def_flow)
cvars = &txqi->def_cvars;
@@ -1404,6 +1404,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
fq_tin_init(&txqi->tin);
fq_flow_init(&txqi->def_flow);
codel_vars_init(&txqi->def_cvars);
+ codel_stats_init(&txqi->cstats);
+ __skb_queue_head_init(&txqi->frags);
txqi->txq.vif = &sdata->vif;
@@ -1426,6 +1428,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
struct fq_tin *tin = &txqi->tin;
fq_tin_reset(fq, tin, fq_skb_free_func);
+ ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
}
int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1433,6 +1436,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
struct fq *fq = &local->fq;
int ret;
int i;
+ bool supp_vht = false;
+ enum nl80211_band band;
if (!local->ops->wake_tx_queue)
return 0;
@@ -1441,8 +1446,24 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
if (ret)
return ret;
+ /*
+ * If the hardware doesn't support VHT, it is safe to limit the maximum
+ * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n.
+ */
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = local->hw.wiphy->bands[band];
+ if (!sband)
+ continue;
+
+ supp_vht = supp_vht || sband->vht_cap.vht_supported;
+ }
+
+ if (!supp_vht)
+ fq->memory_limit = 4 << 20; /* 4 Mbytes */
+
codel_params_init(&local->cparams);
- codel_stats_init(&local->cstats);
local->cparams.interval = MS2TIME(100);
local->cparams.target = MS2TIME(20);
local->cparams.ecn = true;
@@ -1477,54 +1498,37 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
spin_unlock_bh(&fq->lock);
}
-struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
- struct ieee80211_txq *txq)
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct sk_buff *skb)
{
- struct ieee80211_local *local = hw_to_local(hw);
- struct txq_info *txqi = container_of(txq, struct txq_info, txq);
- struct ieee80211_hdr *hdr;
- struct sk_buff *skb = NULL;
struct fq *fq = &local->fq;
- struct fq_tin *tin = &txqi->tin;
-
- spin_lock_bh(&fq->lock);
-
- if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
- goto out;
+ struct ieee80211_vif *vif;
+ struct txq_info *txqi;
- skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
- if (!skb)
- goto out;
+ if (!local->ops->wake_tx_queue ||
+ sdata->vif.type == NL80211_IFTYPE_MONITOR)
+ return false;
- ieee80211_set_skb_vif(skb, txqi);
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
- hdr = (struct ieee80211_hdr *)skb->data;
- if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
- struct sta_info *sta = container_of(txq->sta, struct sta_info,
- sta);
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ vif = &sdata->vif;
+ txqi = ieee80211_get_txq(local, vif, sta, skb);
- hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
- if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
- info->flags |= IEEE80211_TX_CTL_AMPDU;
- else
- info->flags &= ~IEEE80211_TX_CTL_AMPDU;
- }
+ if (!txqi)
+ return false;
-out:
+ spin_lock_bh(&fq->lock);
+ ieee80211_txq_enqueue(local, txqi, skb);
spin_unlock_bh(&fq->lock);
- if (skb && skb_has_frag_list(skb) &&
- !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
- if (skb_linearize(skb)) {
- ieee80211_free_txskb(&local->hw, skb);
- return NULL;
- }
- }
+ drv_wake_tx_queue(local, txqi);
- return skb;
+ return true;
}
-EXPORT_SYMBOL(ieee80211_tx_dequeue);
static bool ieee80211_tx_frags(struct ieee80211_local *local,
struct ieee80211_vif *vif,
@@ -1533,9 +1537,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
bool txpending)
{
struct ieee80211_tx_control control = {};
- struct fq *fq = &local->fq;
struct sk_buff *skb, *tmp;
- struct txq_info *txqi;
unsigned long flags;
skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1550,21 +1552,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
}
#endif
- txqi = ieee80211_get_txq(local, vif, sta, skb);
- if (txqi) {
- info->control.vif = vif;
-
- __skb_unlink(skb, skbs);
-
- spin_lock_bh(&fq->lock);
- ieee80211_txq_enqueue(local, txqi, skb);
- spin_unlock_bh(&fq->lock);
-
- drv_wake_tx_queue(local, txqi);
-
- continue;
- }
-
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
if (local->queue_stop_reasons[q] ||
(!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1648,7 +1635,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
switch (sdata->vif.type) {
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
vif = &sdata->vif;
break;
}
@@ -1685,10 +1672,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
/*
* Invoke TX handlers, return 0 on success and non-zero if the
* frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
*/
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
{
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
ieee80211_tx_result res = TX_DROP;
#define CALL_TXH(txh) \
@@ -1706,6 +1696,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
CALL_TXH(ieee80211_tx_h_rate_ctrl);
+ txh_done:
+ if (unlikely(res == TX_DROP)) {
+ I802_DEBUG_INC(tx->local->tx_handlers_drop);
+ if (tx->skb)
+ ieee80211_free_txskb(&tx->local->hw, tx->skb);
+ else
+ ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+ return -1;
+ } else if (unlikely(res == TX_QUEUED)) {
+ I802_DEBUG_INC(tx->local->tx_handlers_queued);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ ieee80211_tx_result res = TX_CONTINUE;
+
if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
__skb_queue_tail(&tx->skbs, tx->skb);
tx->skb = NULL;
@@ -1738,6 +1753,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
return 0;
}
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+ int r = invoke_tx_handlers_early(tx);
+
+ if (r)
+ return r;
+ return invoke_tx_handlers_late(tx);
+}
+
bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
struct ieee80211_vif *vif, struct sk_buff *skb,
int band, struct ieee80211_sta **sta)
@@ -1812,7 +1836,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
info->hw_queue =
sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
- if (!invoke_tx_handlers(&tx))
+ if (invoke_tx_handlers_early(&tx))
+ return false;
+
+ if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+ return true;
+
+ if (!invoke_tx_handlers_late(&tx))
result = __ieee80211_tx(local, &tx.skbs, led_len,
tx.sta, txpending);
@@ -2268,15 +2298,9 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_STATION:
if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) {
sta = sta_info_get(sdata, skb->data);
- if (sta) {
- bool tdls_peer, tdls_auth;
-
- tdls_peer = test_sta_flag(sta,
- WLAN_STA_TDLS_PEER);
- tdls_auth = test_sta_flag(sta,
- WLAN_STA_TDLS_PEER_AUTH);
-
- if (tdls_peer && tdls_auth) {
+ if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ if (test_sta_flag(sta,
+ WLAN_STA_TDLS_PEER_AUTH)) {
*sta_out = sta;
return 0;
}
@@ -2288,8 +2312,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
* after a TDLS sta is removed due to being
* unreachable.
*/
- if (tdls_peer && !tdls_auth &&
- !ieee80211_is_tdls_setup(skb))
+ if (!ieee80211_is_tdls_setup(skb))
return -EINVAL;
}
@@ -2339,7 +2362,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL;
const u8 *encaps_data;
int encaps_len, skip_header_bytes;
- int nh_pos, h_pos;
bool wme_sta = false, authorized = false;
bool tdls_peer;
bool multicast;
@@ -2645,13 +2667,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
encaps_len = 0;
}
- nh_pos = skb_network_header(skb) - skb->data;
- h_pos = skb_transport_header(skb) - skb->data;
-
skb_pull(skb, skip_header_bytes);
- nh_pos -= skip_header_bytes;
- h_pos -= skip_header_bytes;
-
head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb);
/*
@@ -2677,18 +2693,12 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
}
}
- if (encaps_data) {
+ if (encaps_data)
memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
- nh_pos += encaps_len;
- h_pos += encaps_len;
- }
#ifdef CONFIG_MAC80211_MESH
- if (meshhdrlen > 0) {
+ if (meshhdrlen > 0)
memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen);
- nh_pos += meshhdrlen;
- h_pos += meshhdrlen;
- }
#endif
if (ieee80211_is_data_qos(fc)) {
@@ -2704,15 +2714,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
} else
memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
- nh_pos += hdrlen;
- h_pos += hdrlen;
-
- /* Update skb pointers to various headers since this modified frame
- * is going to go through Linux networking code that may potentially
- * need things like pointer to IP header. */
skb_reset_mac_header(skb);
- skb_set_network_header(skb, nh_pos);
- skb_set_transport_header(skb, h_pos);
info = IEEE80211_SKB_CB(skb);
memset(info, 0, sizeof(*info));
@@ -2792,7 +2794,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
/* fast-xmit doesn't handle fragmentation at all */
if (local->hw.wiphy->frag_threshold != (u32)-1 &&
- !local->ops->set_frag_threshold)
+ !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG))
goto out;
rcu_read_lock();
@@ -3051,11 +3053,12 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr;
- struct ethhdr amsdu_hdr;
+ struct ethhdr *amsdu_hdr;
int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
int subframe_len = skb->len - hdr_len;
void *data;
- u8 *qc;
+ u8 *qc, *h_80211_src, *h_80211_dst;
+ const u8 *bssid;
if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
return false;
@@ -3063,19 +3066,44 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
return true;
- if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr),
&subframe_len))
return false;
- amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
- memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
- memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+ data = skb_push(skb, sizeof(*amsdu_hdr));
+ memmove(data, data + sizeof(*amsdu_hdr), hdr_len);
+ hdr = data;
+ amsdu_hdr = data + hdr_len;
+ /* h_80211_src/dst is addr* field within hdr */
+ h_80211_src = data + fast_tx->sa_offs;
+ h_80211_dst = data + fast_tx->da_offs;
+
+ amsdu_hdr->h_proto = cpu_to_be16(subframe_len);
+ ether_addr_copy(amsdu_hdr->h_source, h_80211_src);
+ ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst);
+
+ /* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA
+ * fields needs to be changed to BSSID for A-MSDU frames depending
+ * on FromDS/ToDS values.
+ */
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ bssid = sdata->u.mgd.bssid;
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ bssid = sdata->vif.addr;
+ break;
+ default:
+ bssid = NULL;
+ }
- data = skb_push(skb, sizeof(amsdu_hdr));
- memmove(data, data + sizeof(amsdu_hdr), hdr_len);
- memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+ if (bssid && ieee80211_has_fromds(hdr->frame_control))
+ ether_addr_copy(h_80211_src, bssid);
+
+ if (bssid && ieee80211_has_tods(hdr->frame_control))
+ ether_addr_copy(h_80211_dst, bssid);
- hdr = data;
qc = ieee80211_get_qos_ctl(hdr);
*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
@@ -3184,8 +3212,70 @@ out:
return ret;
}
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, u8 pn_offs,
+ struct ieee80211_key *key,
+ struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ u8 tid = IEEE80211_NUM_TIDS;
+
+ if (key)
+ info->control.hw_key = &key->conf;
+
+ ieee80211_tx_stats(skb->dev, skb->len);
+
+ if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+ } else {
+ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+ hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+ sdata->sequence_number += 0x10;
+ }
+
+ if (skb_shinfo(skb)->gso_size)
+ sta->tx_stats.msdu[tid] +=
+ DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+ else
+ sta->tx_stats.msdu[tid]++;
+
+ info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+ /* statistics normally done by ieee80211_tx_h_stats (but that
+ * has to consider fragmentation, so is more complex)
+ */
+ sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+ sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+
+ if (pn_offs) {
+ u64 pn;
+ u8 *crypto_hdr = skb->data + pn_offs;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn = atomic64_inc_return(&key->conf.tx_pn);
+ crypto_hdr[0] = pn;
+ crypto_hdr[1] = pn >> 8;
+ crypto_hdr[4] = pn >> 16;
+ crypto_hdr[5] = pn >> 24;
+ crypto_hdr[6] = pn >> 32;
+ crypto_hdr[7] = pn >> 40;
+ break;
+ }
+ }
+}
+
static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
- struct net_device *dev, struct sta_info *sta,
+ struct sta_info *sta,
struct ieee80211_fast_tx *fast_tx,
struct sk_buff *skb)
{
@@ -3194,7 +3284,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);
int hw_headroom = sdata->local->hw.extra_tx_headroom;
struct ethhdr eth;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info *info;
struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
struct ieee80211_tx_data tx;
ieee80211_tx_result r;
@@ -3236,8 +3326,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
return true;
}
- ieee80211_tx_stats(dev, skb->len + extra_head);
-
if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
return true;
@@ -3260,31 +3348,20 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN);
memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN);
+ info = IEEE80211_SKB_CB(skb);
memset(info, 0, sizeof(*info));
info->band = fast_tx->band;
info->control.vif = &sdata->vif;
info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
IEEE80211_TX_CTL_DONTFRAG |
(tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+ info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
*ieee80211_get_qos_ctl(hdr) = tid;
- if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb))
- hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
- } else {
- info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
- hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
- sdata->sequence_number += 0x10;
}
- if (skb_shinfo(skb)->gso_size)
- sta->tx_stats.msdu[tid] +=
- DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
- else
- sta->tx_stats.msdu[tid]++;
-
- info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
-
__skb_queue_head_init(&tx.skbs);
tx.flags = IEEE80211_TX_UNICAST;
@@ -3293,9 +3370,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
tx.sta = sta;
tx.key = fast_tx->key;
- if (fast_tx->key)
- info->control.hw_key = &fast_tx->key->conf;
-
if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
tx.skb = skb;
r = ieee80211_tx_h_rate_ctrl(&tx);
@@ -3309,31 +3383,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
}
}
- /* statistics normally done by ieee80211_tx_h_stats (but that
- * has to consider fragmentation, so is more complex)
- */
- sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
- sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
-
- if (fast_tx->pn_offs) {
- u64 pn;
- u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+ if (ieee80211_queue_skb(local, sdata, sta, skb))
+ return true;
- switch (fast_tx->key->conf.cipher) {
- case WLAN_CIPHER_SUITE_CCMP:
- case WLAN_CIPHER_SUITE_CCMP_256:
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
- pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
- crypto_hdr[0] = pn;
- crypto_hdr[1] = pn >> 8;
- crypto_hdr[4] = pn >> 16;
- crypto_hdr[5] = pn >> 24;
- crypto_hdr[6] = pn >> 32;
- crypto_hdr[7] = pn >> 40;
- break;
- }
- }
+ ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+ fast_tx->key, skb);
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
@@ -3344,6 +3398,99 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
return true;
}
+struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
+ struct ieee80211_txq *txq)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct txq_info *txqi = container_of(txq, struct txq_info, txq);
+ struct ieee80211_hdr *hdr;
+ struct sk_buff *skb = NULL;
+ struct fq *fq = &local->fq;
+ struct fq_tin *tin = &txqi->tin;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_tx_data tx;
+ ieee80211_tx_result r;
+
+ spin_lock_bh(&fq->lock);
+
+ if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+ goto out;
+
+ /* Make sure fragments stay together. */
+ skb = __skb_dequeue(&txqi->frags);
+ if (skb)
+ goto out;
+
+begin:
+ skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
+ if (!skb)
+ goto out;
+
+ ieee80211_set_skb_vif(skb, txqi);
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ info = IEEE80211_SKB_CB(skb);
+
+ memset(&tx, 0, sizeof(tx));
+ __skb_queue_head_init(&tx.skbs);
+ tx.local = local;
+ tx.skb = skb;
+ tx.sdata = vif_to_sdata(info->control.vif);
+
+ if (txq->sta)
+ tx.sta = container_of(txq->sta, struct sta_info, sta);
+
+ /*
+ * The key can be removed while the packet was queued, so need to call
+ * this here to get the current key.
+ */
+ r = ieee80211_tx_h_select_key(&tx);
+ if (r != TX_CONTINUE) {
+ ieee80211_free_txskb(&local->hw, skb);
+ goto begin;
+ }
+
+ if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ else
+ info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+
+ if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
+ struct sta_info *sta = container_of(txq->sta, struct sta_info,
+ sta);
+ u8 pn_offs = 0;
+
+ if (tx.key &&
+ (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+ pn_offs = ieee80211_hdrlen(hdr->frame_control);
+
+ ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+ tx.key, skb);
+ } else {
+ if (invoke_tx_handlers_late(&tx))
+ goto begin;
+
+ skb = __skb_dequeue(&tx.skbs);
+
+ if (!skb_queue_empty(&tx.skbs))
+ skb_queue_splice_tail(&tx.skbs, &txqi->frags);
+ }
+
+ if (skb && skb_has_frag_list(skb) &&
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
+ if (skb_linearize(skb)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ goto begin;
+ }
+ }
+
+out:
+ spin_unlock_bh(&fq->lock);
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_tx_dequeue);
+
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev,
u32 info_flags)
@@ -3368,7 +3515,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
fast_tx = rcu_dereference(sta->fast_tx);
if (fast_tx &&
- ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+ ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
goto out;
}
@@ -4395,9 +4542,6 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
int ac = ieee802_1d_to_ac[tid & 7];
skb_reset_mac_header(skb);
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
-
skb_set_queue_mapping(skb, ac);
skb->priority = tid;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 42bf0b6685e8..ac59fbd280df 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -598,7 +598,7 @@ static void __iterate_interfaces(struct ieee80211_local *local,
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
switch (sdata->vif.type) {
case NL80211_IFTYPE_MONITOR:
- if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
continue;
break;
case NL80211_IFTYPE_AP_VLAN:
@@ -1209,7 +1209,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
}
if (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
- sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) {
+ sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN) {
sdata->vif.bss_conf.qos = enable_qos;
if (bss_notify)
ieee80211_bss_info_change_notify(sdata,
@@ -1748,6 +1749,46 @@ static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
mutex_unlock(&local->sta_mtx);
}
+static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata)
+{
+ struct cfg80211_nan_func *func, **funcs;
+ int res, id, i = 0;
+
+ res = drv_start_nan(sdata->local, sdata,
+ &sdata->u.nan.conf);
+ if (WARN_ON(res))
+ return res;
+
+ funcs = kzalloc((sdata->local->hw.max_nan_de_entries + 1) *
+ sizeof(*funcs), GFP_KERNEL);
+ if (!funcs)
+ return -ENOMEM;
+
+ /* Add all the functions:
+ * This is a little bit ugly. We need to call a potentially sleeping
+ * callback for each NAN function, so we can't hold the spinlock.
+ */
+ spin_lock_bh(&sdata->u.nan.func_lock);
+
+ idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id)
+ funcs[i++] = func;
+
+ spin_unlock_bh(&sdata->u.nan.func_lock);
+
+ for (i = 0; funcs[i]; i++) {
+ res = drv_add_nan_func(sdata->local, sdata, funcs[i]);
+ if (WARN_ON(res))
+ ieee80211_nan_func_terminated(&sdata->vif,
+ funcs[i]->instance_id,
+ NL80211_NAN_FUNC_TERM_REASON_ERROR,
+ GFP_KERNEL);
+ }
+
+ kfree(funcs);
+
+ return 0;
+}
+
int ieee80211_reconfig(struct ieee80211_local *local)
{
struct ieee80211_hw *hw = &local->hw;
@@ -1971,6 +2012,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
ieee80211_bss_info_change_notify(sdata, changed);
}
break;
+ case NL80211_IFTYPE_NAN:
+ res = ieee80211_reconfig_nan(sdata);
+ if (res < 0) {
+ ieee80211_handle_reconfig_failure(local);
+ return res;
+ }
+ break;
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_MONITOR:
@@ -2555,7 +2603,6 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
if (need_basic && basic_rates & BIT(i))
basic = 0x80;
- rate = sband->bitrates[i].bitrate;
rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
5 * (1 << shift));
*pos++ = basic | (u8) rate;
@@ -3261,10 +3308,11 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_sub_if_data *sdata_iter;
enum nl80211_iftype iftype = sdata->wdev.iftype;
- int num[NUM_NL80211_IFTYPES];
struct ieee80211_chanctx *ctx;
- int num_different_channels = 0;
int total = 1;
+ struct iface_combination_params params = {
+ .radar_detect = radar_detect,
+ };
lockdep_assert_held(&local->chanctx_mtx);
@@ -3275,12 +3323,19 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
!chandef->chan))
return -EINVAL;
- if (chandef)
- num_different_channels = 1;
-
if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
return -EINVAL;
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
+ /*
+ * always passing this is harmless, since it'll be the
+ * same value that cfg80211 finds if it finds the same
+ * interface ... and that's always allowed
+ */
+ params.new_beacon_int = sdata->vif.bss_conf.beacon_int;
+ }
+
/* Always allow software iftypes */
if (local->hw.wiphy->software_iftypes & BIT(iftype)) {
if (radar_detect)
@@ -3288,24 +3343,26 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
return 0;
}
- memset(num, 0, sizeof(num));
+ if (chandef)
+ params.num_different_channels = 1;
if (iftype != NL80211_IFTYPE_UNSPECIFIED)
- num[iftype] = 1;
+ params.iftype_num[iftype] = 1;
list_for_each_entry(ctx, &local->chanctx_list, list) {
if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
continue;
- radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ params.radar_detect |=
+ ieee80211_chanctx_radar_detect(local, ctx);
if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) {
- num_different_channels++;
+ params.num_different_channels++;
continue;
}
if (chandef && chanmode == IEEE80211_CHANCTX_SHARED &&
cfg80211_chandef_compatible(chandef,
&ctx->conf.def))
continue;
- num_different_channels++;
+ params.num_different_channels++;
}
list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) {
@@ -3318,16 +3375,14 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype))
continue;
- num[wdev_iter->iftype]++;
+ params.iftype_num[wdev_iter->iftype]++;
total++;
}
- if (total == 1 && !radar_detect)
+ if (total == 1 && !params.radar_detect)
return 0;
- return cfg80211_check_combinations(local->hw.wiphy,
- num_different_channels,
- radar_detect, num);
+ return cfg80211_check_combinations(local->hw.wiphy, &params);
}
static void
@@ -3343,12 +3398,10 @@ ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c,
int ieee80211_max_num_channels(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
- int num[NUM_NL80211_IFTYPES] = {};
struct ieee80211_chanctx *ctx;
- int num_different_channels = 0;
- u8 radar_detect = 0;
u32 max_num_different_channels = 1;
int err;
+ struct iface_combination_params params = {0};
lockdep_assert_held(&local->chanctx_mtx);
@@ -3356,17 +3409,17 @@ int ieee80211_max_num_channels(struct ieee80211_local *local)
if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
continue;
- num_different_channels++;
+ params.num_different_channels++;
- radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ params.radar_detect |=
+ ieee80211_chanctx_radar_detect(local, ctx);
}
list_for_each_entry_rcu(sdata, &local->interfaces, list)
- num[sdata->wdev.iftype]++;
+ params.iftype_num[sdata->wdev.iftype]++;
- err = cfg80211_iter_combinations(local->hw.wiphy,
- num_different_channels, radar_detect,
- num, ieee80211_iter_max_chans,
+ err = cfg80211_iter_combinations(local->hw.wiphy, &params,
+ ieee80211_iter_max_chans,
&max_num_different_channels);
if (err < 0)
return err;
@@ -3394,11 +3447,25 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
unsigned long *byte_cnt)
{
struct txq_info *txqi = to_txq_info(txq);
+ u32 frag_cnt = 0, frag_bytes = 0;
+ struct sk_buff *skb;
+
+ skb_queue_walk(&txqi->frags, skb) {
+ frag_cnt++;
+ frag_bytes += skb->len;
+ }
if (frame_cnt)
- *frame_cnt = txqi->tin.backlog_packets;
+ *frame_cnt = txqi->tin.backlog_packets + frag_cnt;
if (byte_cnt)
- *byte_cnt = txqi->tin.backlog_bytes;
+ *byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
}
EXPORT_SYMBOL(ieee80211_txq_get_depth);
+
+const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = {
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_VO,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_VI,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_BE,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_BK
+};
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ee715764a828..43e45bb660bc 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -270,6 +270,22 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2);
}
+ /*
+ * This is a workaround for VHT-enabled STAs which break the spec
+ * and have the VHT-MCS Rx map filled in with value 3 for all eight
+ * spacial streams, an example is AR9462.
+ *
+ * As per spec, in section 22.1.1 Introduction to the VHT PHY
+ * A VHT STA shall support at least single spactial stream VHT-MCSs
+ * 0 to 7 (transmit and receive) in all supported channel widths.
+ */
+ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) {
+ vht_cap->vht_supported = false;
+ sdata_info(sdata, "Ignoring VHT IE from %pM due to invalid rx_mcs_map\n",
+ sta->addr);
+ return;
+ }
+
/* finally set up the bandwidth */
switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
@@ -511,8 +527,10 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
- if (changed > 0)
+ if (changed > 0) {
+ ieee80211_recalc_min_chandef(sdata);
rate_control_rate_update(local, sband, sta, changed);
+ }
}
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 9eb0aee9105b..3e3d3014e9ab 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -236,26 +236,35 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ u8 flags;
u8 *p;
- u8 ack_policy, tid;
if (!ieee80211_is_data_qos(hdr->frame_control))
return;
p = ieee80211_get_qos_ctl(hdr);
- tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
- /* preserve EOSP bit */
- ack_policy = *p & IEEE80211_QOS_CTL_EOSP;
+ /* set up the first byte */
+
+ /*
+ * preserve everything but the TID and ACK policy
+ * (which we both write here)
+ */
+ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK |
+ IEEE80211_QOS_CTL_ACK_POLICY_MASK);
if (is_multicast_ether_addr(hdr->addr1) ||
sdata->noack_map & BIT(tid)) {
- ack_policy |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
+ flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
info->flags |= IEEE80211_TX_CTL_NO_ACK;
}
- /* qos header is 2 bytes */
- *p++ = ack_policy | tid;
+ *p = flags | tid;
+
+ /* set up the second byte */
+ p++;
+
if (ieee80211_vif_is_mesh(&sdata->vif)) {
/* preserve RSPI and Mesh PS Level bit */
*p &= ((IEEE80211_QOS_CTL_RSPI |
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e13e281..8af6dd388d11 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -57,7 +57,7 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
if (info->control.hw_key &&
(info->flags & IEEE80211_TX_CTL_DONTFRAG ||
- tx->local->ops->set_frag_threshold) &&
+ ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) &&
!(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) {
/* hwaccel - with no need for SW-generated MMIC */
return TX_CONTINUE;
@@ -405,7 +405,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
u8 *pos;
u8 pn[6];
u64 pn64;
- u8 aad[2 * AES_BLOCK_SIZE];
+ u8 aad[CCM_AAD_LEN];
u8 b_0[AES_BLOCK_SIZE];
if (info->control.hw_key &&
@@ -461,10 +461,8 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
pos += IEEE80211_CCMP_HDR_LEN;
ccmp_special_blocks(skb, pn, b_0, aad);
- ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len,
- skb_put(skb, mic_len), mic_len);
-
- return 0;
+ return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len,
+ skb_put(skb, mic_len), mic_len);
}
@@ -639,7 +637,7 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
u8 *pos;
u8 pn[6];
u64 pn64;
- u8 aad[2 * AES_BLOCK_SIZE];
+ u8 aad[GCM_AAD_LEN];
u8 j_0[AES_BLOCK_SIZE];
if (info->control.hw_key &&
@@ -696,10 +694,8 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
pos += IEEE80211_GCMP_HDR_LEN;
gcmp_special_blocks(skb, pn, j_0, aad);
- ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len,
- skb_put(skb, IEEE80211_GCMP_MIC_LEN));
-
- return 0;
+ return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len,
+ skb_put(skb, IEEE80211_GCMP_MIC_LEN));
}
ieee80211_tx_result
@@ -1123,9 +1119,9 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
struct ieee80211_key *key = tx->key;
struct ieee80211_mmie_16 *mmie;
struct ieee80211_hdr *hdr;
- u8 aad[20];
+ u8 aad[GMAC_AAD_LEN];
u64 pn64;
- u8 nonce[12];
+ u8 nonce[GMAC_NONCE_LEN];
if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
return TX_DROP;
@@ -1171,7 +1167,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
struct ieee80211_key *key = rx->key;
struct ieee80211_mmie_16 *mmie;
- u8 aad[20], mic[16], ipn[6], nonce[12];
+ u8 aad[GMAC_AAD_LEN], mic[GMAC_MIC_LEN], ipn[6], nonce[GMAC_NONCE_LEN];
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
if (!ieee80211_is_mgmt(hdr->frame_control))
diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile
index 17a51e8389e2..5857bb1e1695 100644
--- a/net/mac802154/Makefile
+++ b/net/mac802154/Makefile
@@ -3,5 +3,3 @@ mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \
iface.o llsec.o util.o cfg.o trace.o
CFLAGS_trace.o := -I$(src)
-
-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 7079cd32a7ad..06019dba4b10 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
/* TODO check this */
SET_NETDEV_DEV(ndev, &local->phy->dev);
+ dev_net_set(ndev, wpan_phy_net(local->hw.phy));
sdata = netdev_priv(ndev);
ndev->ieee802154_ptr = &sdata->wpan_dev;
memcpy(sdata->name, ndev->name, IFNAMSIZ);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 446e1300383e..4dcf6e18563a 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -101,11 +101,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
sdata->dev->stats.rx_bytes += skb->len;
switch (mac_cb(skb)->type) {
+ case IEEE802154_FC_TYPE_BEACON:
+ case IEEE802154_FC_TYPE_ACK:
+ case IEEE802154_FC_TYPE_MAC_CMD:
+ goto fail;
+
case IEEE802154_FC_TYPE_DATA:
return ieee802154_deliver_skb(skb);
default:
- pr_warn("ieee802154: bad frame received (type = %d)\n",
- mac_cb(skb)->type);
+ pr_warn_ratelimited("ieee802154: bad frame received "
+ "(type = %d)\n", mac_cb(skb)->type);
goto fail;
}
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index f9fd0957ab67..7c03fb0ea34c 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -80,11 +80,11 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
if (skb->len > max_sifs_size)
hrtimer_start(&local->ifs_timer,
- ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC),
+ hw->phy->lifs_period * NSEC_PER_USEC,
HRTIMER_MODE_REL);
else
hrtimer_start(&local->ifs_timer,
- ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC),
+ hw->phy->sifs_period * NSEC_PER_USEC,
HRTIMER_MODE_REL);
} else {
ieee802154_wake_queue(hw);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5c161e7759b5..5b77377e5a15 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -98,18 +98,19 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
}
EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
-static u32 mpls_multipath_hash(struct mpls_route *rt,
- struct sk_buff *skb, bool bos)
+static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
{
struct mpls_entry_decoded dec;
+ unsigned int mpls_hdr_len = 0;
struct mpls_shim_hdr *hdr;
bool eli_seen = false;
int label_index;
u32 hash = 0;
- for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
+ for (label_index = 0; label_index < MAX_MP_SELECT_LABELS;
label_index++) {
- if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
+ mpls_hdr_len += sizeof(*hdr);
+ if (!pskb_may_pull(skb, mpls_hdr_len))
break;
/* Read and decode the current label */
@@ -134,37 +135,38 @@ static u32 mpls_multipath_hash(struct mpls_route *rt,
eli_seen = true;
}
- bos = dec.bos;
- if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index +
- sizeof(struct iphdr))) {
+ if (!dec.bos)
+ continue;
+
+ /* found bottom label; does skb have room for a header? */
+ if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) {
const struct iphdr *v4hdr;
- v4hdr = (const struct iphdr *)(mpls_hdr(skb) +
- label_index);
+ v4hdr = (const struct iphdr *)(hdr + 1);
if (v4hdr->version == 4) {
hash = jhash_3words(ntohl(v4hdr->saddr),
ntohl(v4hdr->daddr),
v4hdr->protocol, hash);
} else if (v4hdr->version == 6 &&
- pskb_may_pull(skb, sizeof(*hdr) * label_index +
- sizeof(struct ipv6hdr))) {
+ pskb_may_pull(skb, mpls_hdr_len +
+ sizeof(struct ipv6hdr))) {
const struct ipv6hdr *v6hdr;
- v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) +
- label_index);
-
+ v6hdr = (const struct ipv6hdr *)(hdr + 1);
hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
hash = jhash_1word(v6hdr->nexthdr, hash);
}
}
+
+ break;
}
return hash;
}
static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
- struct sk_buff *skb, bool bos)
+ struct sk_buff *skb)
{
int alive = ACCESS_ONCE(rt->rt_nhn_alive);
u32 hash = 0;
@@ -180,7 +182,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
if (alive <= 0)
return NULL;
- hash = mpls_multipath_hash(rt, skb, bos);
+ hash = mpls_multipath_hash(rt, skb);
nh_index = hash % alive;
if (alive == rt->rt_nhn)
goto out;
@@ -278,17 +280,11 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
hdr = mpls_hdr(skb);
dec = mpls_entry_decode(hdr);
- /* Pop the label */
- skb_pull(skb, sizeof(*hdr));
- skb_reset_network_header(skb);
-
- skb_orphan(skb);
-
rt = mpls_route_input_rcu(net, dec.label);
if (!rt)
goto drop;
- nh = mpls_select_multipath(rt, skb, dec.bos);
+ nh = mpls_select_multipath(rt, skb);
if (!nh)
goto drop;
@@ -297,6 +293,12 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
if (!mpls_output_possible(out_dev))
goto drop;
+ /* Pop the label */
+ skb_pull(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+
+ skb_orphan(skb);
+
if (skb_warn_if_lro(skb))
goto drop;
@@ -961,9 +963,6 @@ static void mpls_ifdown(struct net_device *dev, int event)
RCU_INIT_POINTER(nh->nh_dev, NULL);
} endfor_nexthops(rt);
}
-
-
- return;
}
static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
@@ -997,8 +996,6 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
ACCESS_ONCE(rt->rt_nhn_alive) = alive;
}
-
- return;
}
static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
@@ -1257,7 +1254,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!nla)
continue;
- switch(index) {
+ switch (index) {
case RTA_OIF:
cfg->rc_ifindex = nla_get_u32(nla);
break;
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 732a5c17e986..bdfef6c3271a 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,9 +1,6 @@
#ifndef MPLS_INTERNAL_H
#define MPLS_INTERNAL_H
-
-struct mpls_shim_hdr {
- __be32 label_stack_entry;
-};
+#include <net/mpls.h>
struct mpls_entry_decoded {
u32 label;
@@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */
#define endfor_nexthops(rt) }
-static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
-{
- return (struct mpls_shim_hdr *)skb_network_header(skb);
-}
-
static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
{
struct mpls_shim_hdr result;
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 2055e57ed1c3..b4da6d8e8632 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -23,32 +23,50 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
netdev_features_t mpls_features;
+ u16 mac_len = skb->mac_len;
__be16 mpls_protocol;
+ unsigned int mpls_hlen;
+
+ skb_reset_network_header(skb);
+ mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
+ if (unlikely(!pskb_may_pull(skb, mpls_hlen)))
+ goto out;
/* Setup inner SKB. */
mpls_protocol = skb->protocol;
skb->protocol = skb->inner_protocol;
- /* Push back the mac header that skb_mac_gso_segment() has pulled.
- * It will be re-pulled by the call to skb_mac_gso_segment() below
- */
- __skb_push(skb, skb->mac_len);
+ __skb_pull(skb, mpls_hlen);
+
+ skb->mac_len = 0;
+ skb_reset_mac_header(skb);
/* Segment inner packet. */
mpls_features = skb->dev->mpls_features & features;
segs = skb_mac_gso_segment(skb, mpls_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+ skb = segs;
+
+ mpls_hlen += mac_len;
+ do {
+ skb->mac_len = mac_len;
+ skb->protocol = mpls_protocol;
+ skb_reset_inner_network_header(skb);
- /* Restore outer protocol. */
- skb->protocol = mpls_protocol;
+ __skb_push(skb, mpls_hlen);
- /* Re-pull the mac header that the call to skb_mac_gso_segment()
- * above pulled. It will be re-pushed after returning
- * skb_mac_gso_segment(), an indirect caller of this function.
- */
- __skb_pull(skb, skb->data - skb_mac_header(skb));
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ } while ((skb = skb->next));
+out:
return segs;
}
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 644a8da6d4bd..1d281c1ff7c1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
return en->labels * sizeof(struct mpls_shim_hdr);
}
-static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int mpls_xmit(struct sk_buff *skb)
{
struct mpls_iptunnel_encap *tun_encap_info;
struct mpls_shim_hdr *hdr;
@@ -90,7 +90,11 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (skb_cow(skb, hh_len + new_header_size))
goto drop;
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb_reset_inner_network_header(skb);
+
skb_push(skb, new_header_size);
+
skb_reset_network_header(skb);
skb->dev = out_dev;
@@ -115,7 +119,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
- return 0;
+ return LWTUNNEL_XMIT_DONE;
drop:
kfree_skb(skb);
@@ -129,7 +133,6 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
struct mpls_iptunnel_encap *tun_encap_info;
struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
struct lwtunnel_state *newts;
- int tun_encap_info_len;
int ret;
ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
@@ -140,20 +143,19 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
if (!tb[MPLS_IPTUNNEL_DST])
return -EINVAL;
- tun_encap_info_len = sizeof(*tun_encap_info);
- newts = lwtunnel_state_alloc(tun_encap_info_len);
+ newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
if (!newts)
return -ENOMEM;
- newts->len = tun_encap_info_len;
tun_encap_info = mpls_lwtunnel_encap(newts);
ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
&tun_encap_info->labels, tun_encap_info->label);
if (ret)
goto errout;
newts->type = LWTUNNEL_ENCAP_MPLS;
- newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ newts->headroom = mpls_encap_size(tun_encap_info);
*ts = newts;
@@ -209,10 +211,11 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
static const struct lwtunnel_encap_ops mpls_iptun_ops = {
.build_state = mpls_build_state,
- .output = mpls_output,
+ .xmit = mpls_xmit,
.fill_encap = mpls_fill_encap_info,
.get_encap_size = mpls_encap_nlsize,
.cmp_encap = mpls_encap_cmp,
+ .owner = THIS_MODULE,
};
static int __init mpls_iptunnel_init(void)
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 33738c060547..1308a56f2591 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -170,6 +170,7 @@ struct ncsi_package;
#define NCSI_PACKAGE_SHIFT 5
#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
+#define NCSI_RESERVED_CHANNEL 0x1f
#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
@@ -186,9 +187,15 @@ struct ncsi_channel {
struct ncsi_channel_mode modes[NCSI_MODE_MAX];
struct ncsi_channel_filter *filters[NCSI_FILTER_MAX];
struct ncsi_channel_stats stats;
- struct timer_list timer; /* Link monitor timer */
- bool enabled; /* Timer is enabled */
- unsigned int timeout; /* Times of timeout */
+ struct {
+ struct timer_list timer;
+ bool enabled;
+ unsigned int state;
+#define NCSI_CHANNEL_MONITOR_START 0
+#define NCSI_CHANNEL_MONITOR_RETRY 1
+#define NCSI_CHANNEL_MONITOR_WAIT 2
+#define NCSI_CHANNEL_MONITOR_WAIT_MAX 5
+ } monitor;
struct list_head node;
struct list_head link;
};
@@ -206,7 +213,8 @@ struct ncsi_package {
struct ncsi_request {
unsigned char id; /* Request ID - 0 to 255 */
bool used; /* Request that has been assigned */
- bool driven; /* Drive state machine */
+ unsigned int flags; /* NCSI request property */
+#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
struct ncsi_dev_priv *ndp; /* Associated NCSI device */
struct sk_buff *cmd; /* Associated NCSI command packet */
struct sk_buff *rsp; /* Associated NCSI response packet */
@@ -238,6 +246,7 @@ enum {
ncsi_dev_state_config_gls,
ncsi_dev_state_config_done,
ncsi_dev_state_suspend_select = 0x0401,
+ ncsi_dev_state_suspend_gls,
ncsi_dev_state_suspend_dcnt,
ncsi_dev_state_suspend_dc,
ncsi_dev_state_suspend_deselect,
@@ -256,8 +265,10 @@ struct ncsi_dev_priv {
#endif
unsigned int package_num; /* Number of packages */
struct list_head packages; /* List of packages */
+ struct ncsi_channel *hot_channel; /* Channel was ever active */
struct ncsi_request requests[256]; /* Request table */
unsigned int request_id; /* Last used request ID */
+#define NCSI_REQ_START_IDX 1
unsigned int pending_req_num; /* Number of pending requests */
struct ncsi_package *active_package; /* Currently handled package */
struct ncsi_channel *active_channel; /* Currently handled channel */
@@ -274,7 +285,7 @@ struct ncsi_cmd_arg {
unsigned char package; /* Destination package ID */
unsigned char channel; /* Detination channel ID or 0x1f */
unsigned short payload; /* Command packet payload length */
- bool driven; /* Drive the state machine? */
+ unsigned int req_flags; /* NCSI request properties */
union {
unsigned char bytes[16]; /* Command packet specific data */
unsigned short words[8];
@@ -313,7 +324,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
unsigned char id,
struct ncsi_package **np,
struct ncsi_channel **nc);
-struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven);
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
+ unsigned int req_flags);
void ncsi_free_request(struct ncsi_request *nr);
struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index d463468442ae..6898e7229285 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -53,7 +53,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
struct ncsi_aen_lsc_pkt *lsc;
struct ncsi_channel *nc;
struct ncsi_channel_mode *ncm;
- unsigned long old_data;
+ bool chained;
+ int state;
+ unsigned long old_data, data;
unsigned long flags;
/* Find the NCSI channel */
@@ -62,20 +64,27 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
return -ENODEV;
/* Update the link status */
- ncm = &nc->modes[NCSI_MODE_LINK];
lsc = (struct ncsi_aen_lsc_pkt *)h;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ ncm = &nc->modes[NCSI_MODE_LINK];
old_data = ncm->data[2];
- ncm->data[2] = ntohl(lsc->status);
+ data = ntohl(lsc->status);
+ ncm->data[2] = data;
ncm->data[4] = ntohl(lsc->oem_status);
- if (!((old_data ^ ncm->data[2]) & 0x1) ||
- !list_empty(&nc->link))
+
+ chained = !list_empty(&nc->link);
+ state = nc->state;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ if (!((old_data ^ data) & 0x1) || chained)
return 0;
- if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) &&
- !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1)))
+ if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
+ !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
return 0;
if (!(ndp->flags & NCSI_DEV_HWA) &&
- nc->state == NCSI_CHANNEL_ACTIVE)
+ state == NCSI_CHANNEL_ACTIVE)
ndp->flags |= NCSI_DEV_RESHUFFLE;
ncsi_stop_channel_monitor(nc);
@@ -97,13 +106,21 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
if (!nc)
return -ENODEV;
+ spin_lock_irqsave(&nc->lock, flags);
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_ACTIVE)
+ nc->state != NCSI_CHANNEL_ACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
return 0;
+ }
+ spin_unlock_irqrestore(&nc->lock, flags);
ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
spin_lock_irqsave(&ndp->lock, flags);
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ nc->state = NCSI_CHANNEL_INACTIVE;
list_add_tail_rcu(&nc->link, &ndp->channel_queue);
spin_unlock_irqrestore(&ndp->lock, flags);
@@ -124,23 +141,35 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp,
return -ENODEV;
/* If the channel is active one, we need reconfigure it */
+ spin_lock_irqsave(&nc->lock, flags);
ncm = &nc->modes[NCSI_MODE_LINK];
hncdsc = (struct ncsi_aen_hncdsc_pkt *)h;
ncm->data[3] = ntohl(hncdsc->status);
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_ACTIVE ||
- (ncm->data[3] & 0x1))
+ nc->state != NCSI_CHANNEL_ACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
return 0;
+ }
- if (ndp->flags & NCSI_DEV_HWA)
+ spin_unlock_irqrestore(&nc->lock, flags);
+ if (!(ndp->flags & NCSI_DEV_HWA) && !(ncm->data[3] & 0x1))
ndp->flags |= NCSI_DEV_RESHUFFLE;
/* If this channel is the active one and the link doesn't
* work, we have to choose another channel to be active one.
* The logic here is exactly similar to what we do when link
* is down on the active channel.
+ *
+ * On the other hand, we need configure it when host driver
+ * state on the active channel becomes ready.
*/
ncsi_stop_channel_monitor(nc);
+
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = (ncm->data[3] & 0x1) ? NCSI_CHANNEL_INACTIVE :
+ NCSI_CHANNEL_ACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
spin_lock_irqsave(&ndp->lock, flags);
list_add_tail_rcu(&nc->link, &ndp->channel_queue);
spin_unlock_irqrestore(&ndp->lock, flags);
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 21057a8ceeac..db7083bfd476 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -272,7 +272,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca)
struct sk_buff *skb;
struct ncsi_request *nr;
- nr = ncsi_alloc_request(ndp, nca->driven);
+ nr = ncsi_alloc_request(ndp, nca->req_flags);
if (!nr)
return NULL;
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index ef017b871857..a3bd5fa8ad09 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -132,6 +132,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
struct ncsi_dev *nd = &ndp->ndev;
struct ncsi_package *np;
struct ncsi_channel *nc;
+ unsigned long flags;
nd->state = ncsi_dev_state_functional;
if (force_down) {
@@ -142,14 +143,21 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
nd->link_up = 0;
NCSI_FOR_EACH_PACKAGE(ndp, np) {
NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_ACTIVE)
+ nc->state != NCSI_CHANNEL_ACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
continue;
+ }
if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+ spin_unlock_irqrestore(&nc->lock, flags);
nd->link_up = 1;
goto report;
}
+
+ spin_unlock_irqrestore(&nc->lock, flags);
}
}
@@ -163,43 +171,55 @@ static void ncsi_channel_monitor(unsigned long data)
struct ncsi_package *np = nc->package;
struct ncsi_dev_priv *ndp = np->ndp;
struct ncsi_cmd_arg nca;
- bool enabled;
- unsigned int timeout;
+ bool enabled, chained;
+ unsigned int monitor_state;
unsigned long flags;
- int ret;
+ int state, ret;
spin_lock_irqsave(&nc->lock, flags);
- timeout = nc->timeout;
- enabled = nc->enabled;
+ state = nc->state;
+ chained = !list_empty(&nc->link);
+ enabled = nc->monitor.enabled;
+ monitor_state = nc->monitor.state;
spin_unlock_irqrestore(&nc->lock, flags);
- if (!enabled || !list_empty(&nc->link))
+ if (!enabled || chained)
return;
- if (nc->state != NCSI_CHANNEL_INACTIVE &&
- nc->state != NCSI_CHANNEL_ACTIVE)
+ if (state != NCSI_CHANNEL_INACTIVE &&
+ state != NCSI_CHANNEL_ACTIVE)
return;
- if (!(timeout % 2)) {
+ switch (monitor_state) {
+ case NCSI_CHANNEL_MONITOR_START:
+ case NCSI_CHANNEL_MONITOR_RETRY:
nca.ndp = ndp;
nca.package = np->id;
nca.channel = nc->id;
nca.type = NCSI_PKT_CMD_GLS;
- nca.driven = false;
+ nca.req_flags = 0;
ret = ncsi_xmit_cmd(&nca);
if (ret) {
netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
ret);
return;
}
- }
- if (timeout + 1 >= 3) {
+ break;
+ case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX:
+ break;
+ default:
if (!(ndp->flags & NCSI_DEV_HWA) &&
- nc->state == NCSI_CHANNEL_ACTIVE)
+ state == NCSI_CHANNEL_ACTIVE) {
ncsi_report_link(ndp, true);
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
+ }
+
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
spin_lock_irqsave(&ndp->lock, flags);
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ nc->state = NCSI_CHANNEL_INACTIVE;
list_add_tail_rcu(&nc->link, &ndp->channel_queue);
spin_unlock_irqrestore(&ndp->lock, flags);
ncsi_process_next_channel(ndp);
@@ -207,10 +227,9 @@ static void ncsi_channel_monitor(unsigned long data)
}
spin_lock_irqsave(&nc->lock, flags);
- nc->timeout = timeout + 1;
- nc->enabled = true;
+ nc->monitor.state++;
spin_unlock_irqrestore(&nc->lock, flags);
- mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+ mod_timer(&nc->monitor.timer, jiffies + HZ);
}
void ncsi_start_channel_monitor(struct ncsi_channel *nc)
@@ -218,12 +237,12 @@ void ncsi_start_channel_monitor(struct ncsi_channel *nc)
unsigned long flags;
spin_lock_irqsave(&nc->lock, flags);
- WARN_ON_ONCE(nc->enabled);
- nc->timeout = 0;
- nc->enabled = true;
+ WARN_ON_ONCE(nc->monitor.enabled);
+ nc->monitor.enabled = true;
+ nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
spin_unlock_irqrestore(&nc->lock, flags);
- mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+ mod_timer(&nc->monitor.timer, jiffies + HZ);
}
void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
@@ -231,14 +250,14 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
unsigned long flags;
spin_lock_irqsave(&nc->lock, flags);
- if (!nc->enabled) {
+ if (!nc->monitor.enabled) {
spin_unlock_irqrestore(&nc->lock, flags);
return;
}
- nc->enabled = false;
+ nc->monitor.enabled = false;
spin_unlock_irqrestore(&nc->lock, flags);
- del_timer_sync(&nc->timer);
+ del_timer_sync(&nc->monitor.timer);
}
struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
@@ -267,8 +286,9 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
nc->id = id;
nc->package = np;
nc->state = NCSI_CHANNEL_INACTIVE;
- nc->enabled = false;
- setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc);
+ nc->monitor.enabled = false;
+ setup_timer(&nc->monitor.timer,
+ ncsi_channel_monitor, (unsigned long)nc);
spin_lock_init(&nc->lock);
INIT_LIST_HEAD(&nc->link);
for (index = 0; index < NCSI_CAP_MAX; index++)
@@ -405,7 +425,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
* be same. Otherwise, the bogus response might be replied. So
* the available IDs are allocated in round-robin fashion.
*/
-struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
+ unsigned int req_flags)
{
struct ncsi_request *nr = NULL;
int i, limit = ARRAY_SIZE(ndp->requests);
@@ -413,30 +434,31 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
/* Check if there is one available request until the ceiling */
spin_lock_irqsave(&ndp->lock, flags);
- for (i = ndp->request_id; !nr && i < limit; i++) {
+ for (i = ndp->request_id; i < limit; i++) {
if (ndp->requests[i].used)
continue;
nr = &ndp->requests[i];
nr->used = true;
- nr->driven = driven;
- if (++ndp->request_id >= limit)
- ndp->request_id = 0;
+ nr->flags = req_flags;
+ ndp->request_id = i + 1;
+ goto found;
}
/* Fail back to check from the starting cursor */
- for (i = 0; !nr && i < ndp->request_id; i++) {
+ for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) {
if (ndp->requests[i].used)
continue;
nr = &ndp->requests[i];
nr->used = true;
- nr->driven = driven;
- if (++ndp->request_id >= limit)
- ndp->request_id = 0;
+ nr->flags = req_flags;
+ ndp->request_id = i + 1;
+ goto found;
}
- spin_unlock_irqrestore(&ndp->lock, flags);
+found:
+ spin_unlock_irqrestore(&ndp->lock, flags);
return nr;
}
@@ -458,7 +480,7 @@ void ncsi_free_request(struct ncsi_request *nr)
nr->cmd = NULL;
nr->rsp = NULL;
nr->used = false;
- driven = nr->driven;
+ driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN);
spin_unlock_irqrestore(&ndp->lock, flags);
if (driven && cmd && --ndp->pending_req_num == 0)
@@ -508,55 +530,102 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
struct ncsi_package *np = ndp->active_package;
struct ncsi_channel *nc = ndp->active_channel;
struct ncsi_cmd_arg nca;
+ unsigned long flags;
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_suspend:
nd->state = ncsi_dev_state_suspend_select;
/* Fall through */
case ncsi_dev_state_suspend_select:
- case ncsi_dev_state_suspend_dcnt:
- case ncsi_dev_state_suspend_dc:
- case ncsi_dev_state_suspend_deselect:
ndp->pending_req_num = 1;
- np = ndp->active_package;
- nc = ndp->active_channel;
+ nca.type = NCSI_PKT_CMD_SP;
nca.package = np->id;
- if (nd->state == ncsi_dev_state_suspend_select) {
- nca.type = NCSI_PKT_CMD_SP;
- nca.channel = 0x1f;
- if (ndp->flags & NCSI_DEV_HWA)
- nca.bytes[0] = 0;
- else
- nca.bytes[0] = 1;
+ nca.channel = NCSI_RESERVED_CHANNEL;
+ if (ndp->flags & NCSI_DEV_HWA)
+ nca.bytes[0] = 0;
+ else
+ nca.bytes[0] = 1;
+
+ /* To retrieve the last link states of channels in current
+ * package when current active channel needs fail over to
+ * another one. It means we will possibly select another
+ * channel as next active one. The link states of channels
+ * are most important factor of the selection. So we need
+ * accurate link states. Unfortunately, the link states on
+ * inactive channels can't be updated with LSC AEN in time.
+ */
+ if (ndp->flags & NCSI_DEV_RESHUFFLE)
+ nd->state = ncsi_dev_state_suspend_gls;
+ else
nd->state = ncsi_dev_state_suspend_dcnt;
- } else if (nd->state == ncsi_dev_state_suspend_dcnt) {
- nca.type = NCSI_PKT_CMD_DCNT;
- nca.channel = nc->id;
- nd->state = ncsi_dev_state_suspend_dc;
- } else if (nd->state == ncsi_dev_state_suspend_dc) {
- nca.type = NCSI_PKT_CMD_DC;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ break;
+ case ncsi_dev_state_suspend_gls:
+ ndp->pending_req_num = np->channel_num;
+
+ nca.type = NCSI_PKT_CMD_GLS;
+ nca.package = np->id;
+
+ nd->state = ncsi_dev_state_suspend_dcnt;
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
nca.channel = nc->id;
- nca.bytes[0] = 1;
- nd->state = ncsi_dev_state_suspend_deselect;
- } else if (nd->state == ncsi_dev_state_suspend_deselect) {
- nca.type = NCSI_PKT_CMD_DP;
- nca.channel = 0x1f;
- nd->state = ncsi_dev_state_suspend_done;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
}
+ break;
+ case ncsi_dev_state_suspend_dcnt:
+ ndp->pending_req_num = 1;
+
+ nca.type = NCSI_PKT_CMD_DCNT;
+ nca.package = np->id;
+ nca.channel = nc->id;
+
+ nd->state = ncsi_dev_state_suspend_dc;
ret = ncsi_xmit_cmd(&nca);
- if (ret) {
- nd->state = ncsi_dev_state_functional;
- return;
- }
+ if (ret)
+ goto error;
+
+ break;
+ case ncsi_dev_state_suspend_dc:
+ ndp->pending_req_num = 1;
+
+ nca.type = NCSI_PKT_CMD_DC;
+ nca.package = np->id;
+ nca.channel = nc->id;
+ nca.bytes[0] = 1;
+
+ nd->state = ncsi_dev_state_suspend_deselect;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
+
+ break;
+ case ncsi_dev_state_suspend_deselect:
+ ndp->pending_req_num = 1;
+
+ nca.type = NCSI_PKT_CMD_DP;
+ nca.package = np->id;
+ nca.channel = NCSI_RESERVED_CHANNEL;
+
+ nd->state = ncsi_dev_state_suspend_done;
+ ret = ncsi_xmit_cmd(&nca);
+ if (ret)
+ goto error;
break;
case ncsi_dev_state_suspend_done:
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
ncsi_process_next_channel(ndp);
break;
@@ -564,6 +633,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
nd->state);
}
+
+ return;
+error:
+ nd->state = ncsi_dev_state_functional;
}
static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
@@ -572,12 +645,14 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
struct net_device *dev = nd->dev;
struct ncsi_package *np = ndp->active_package;
struct ncsi_channel *nc = ndp->active_channel;
+ struct ncsi_channel *hot_nc = NULL;
struct ncsi_cmd_arg nca;
unsigned char index;
+ unsigned long flags;
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_config:
case ncsi_dev_state_config_sp:
@@ -590,7 +665,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
else
nca.bytes[0] = 1;
nca.package = np->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -675,10 +750,20 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
goto error;
break;
case ncsi_dev_state_config_done:
- if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1)
- xchg(&nc->state, NCSI_CHANNEL_ACTIVE);
- else
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ spin_lock_irqsave(&nc->lock, flags);
+ if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+ hot_nc = nc;
+ nc->state = NCSI_CHANNEL_ACTIVE;
+ } else {
+ hot_nc = NULL;
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ }
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ /* Update the hot channel */
+ spin_lock_irqsave(&ndp->lock, flags);
+ ndp->hot_channel = hot_nc;
+ spin_unlock_irqrestore(&ndp->lock, flags);
ncsi_start_channel_monitor(nc);
ncsi_process_next_channel(ndp);
@@ -697,28 +782,42 @@ error:
static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
{
struct ncsi_package *np;
- struct ncsi_channel *nc, *found;
+ struct ncsi_channel *nc, *found, *hot_nc;
struct ncsi_channel_mode *ncm;
unsigned long flags;
+ spin_lock_irqsave(&ndp->lock, flags);
+ hot_nc = ndp->hot_channel;
+ spin_unlock_irqrestore(&ndp->lock, flags);
+
/* The search is done once an inactive channel with up
* link is found.
*/
found = NULL;
NCSI_FOR_EACH_PACKAGE(ndp, np) {
NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_INACTIVE)
+ nc->state != NCSI_CHANNEL_INACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
continue;
+ }
if (!found)
found = nc;
+ if (nc == hot_nc)
+ found = nc;
+
ncm = &nc->modes[NCSI_MODE_LINK];
if (ncm->data[2] & 0x1) {
+ spin_unlock_irqrestore(&nc->lock, flags);
found = nc;
goto out;
}
+
+ spin_unlock_irqrestore(&nc->lock, flags);
}
}
@@ -797,7 +896,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_probe:
nd->state = ncsi_dev_state_probe_deselect;
@@ -807,7 +906,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Deselect all possible packages */
nca.type = NCSI_PKT_CMD_DP;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
for (index = 0; index < 8; index++) {
nca.package = index;
ret = ncsi_xmit_cmd(&nca);
@@ -823,7 +922,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Select all possible packages */
nca.type = NCSI_PKT_CMD_SP;
nca.bytes[0] = 1;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
for (index = 0; index < 8; index++) {
nca.package = index;
ret = ncsi_xmit_cmd(&nca);
@@ -876,7 +975,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nca.type = NCSI_PKT_CMD_SP;
nca.bytes[0] = 1;
nca.package = ndp->active_package->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -884,12 +983,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nd->state = ncsi_dev_state_probe_cis;
break;
case ncsi_dev_state_probe_cis:
- ndp->pending_req_num = 32;
+ ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
/* Clear initial state */
nca.type = NCSI_PKT_CMD_CIS;
nca.package = ndp->active_package->id;
- for (index = 0; index < 0x20; index++) {
+ for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) {
nca.channel = index;
ret = ncsi_xmit_cmd(&nca);
if (ret)
@@ -933,7 +1032,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Deselect the active package */
nca.type = NCSI_PKT_CMD_DP;
nca.package = ndp->active_package->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -987,11 +1086,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
goto out;
}
- old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE);
list_del_init(&nc->link);
-
spin_unlock_irqrestore(&ndp->lock, flags);
+ spin_lock_irqsave(&nc->lock, flags);
+ old_state = nc->state;
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
ndp->active_channel = nc;
ndp->active_package = nc->package;
@@ -1006,7 +1108,7 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
break;
default:
netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n",
- nc->state, nc->package->id, nc->id);
+ old_state, nc->package->id, nc->id);
ncsi_report_link(ndp, false);
return -EINVAL;
}
@@ -1070,7 +1172,7 @@ static int ncsi_inet6addr_event(struct notifier_block *this,
return NOTIFY_OK;
nca.ndp = ndp;
- nca.driven = false;
+ nca.req_flags = 0;
nca.package = np->id;
nca.channel = nc->id;
nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
@@ -1118,7 +1220,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
/* Initialize private NCSI device */
spin_lock_init(&ndp->lock);
INIT_LIST_HEAD(&ndp->packages);
- ndp->request_id = 0;
+ ndp->request_id = NCSI_REQ_START_IDX;
for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
ndp->requests[i].id = i;
ndp->requests[i].ndp = ndp;
@@ -1149,9 +1251,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev);
int ncsi_start_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
- struct ncsi_package *np;
- struct ncsi_channel *nc;
- int old_state, ret;
+ int ret;
if (nd->state != ncsi_dev_state_registered &&
nd->state != ncsi_dev_state_functional)
@@ -1163,15 +1263,6 @@ int ncsi_start_dev(struct ncsi_dev *nd)
return 0;
}
- /* Reset channel's state and start over */
- NCSI_FOR_EACH_PACKAGE(ndp, np) {
- NCSI_FOR_EACH_CHANNEL(np, nc) {
- old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
- WARN_ON_ONCE(!list_empty(&nc->link) ||
- old_state == NCSI_CHANNEL_INVISIBLE);
- }
- }
-
if (ndp->flags & NCSI_DEV_HWA)
ret = ncsi_enable_hwa(ndp);
else
@@ -1181,6 +1272,35 @@ int ncsi_start_dev(struct ncsi_dev *nd)
}
EXPORT_SYMBOL_GPL(ncsi_start_dev);
+void ncsi_stop_dev(struct ncsi_dev *nd)
+{
+ struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ bool chained;
+ int old_state;
+ unsigned long flags;
+
+ /* Stop the channel monitor and reset channel's state */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ ncsi_stop_channel_monitor(nc);
+
+ spin_lock_irqsave(&nc->lock, flags);
+ chained = !list_empty(&nc->link);
+ old_state = nc->state;
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ WARN_ON_ONCE(chained ||
+ old_state == NCSI_CHANNEL_INVISIBLE);
+ }
+ }
+
+ ncsi_report_link(ndp, true);
+}
+EXPORT_SYMBOL_GPL(ncsi_stop_dev);
+
void ncsi_unregister_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index af84389a6bf1..087db775b3dc 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -317,12 +317,12 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
ncm->data[3] = ntohl(rsp->other);
ncm->data[4] = ntohl(rsp->oem_status);
- if (nr->driven)
+ if (nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN)
return 0;
/* Reset the channel monitor if it has been enabled */
spin_lock_irqsave(&nc->lock, flags);
- nc->timeout = 0;
+ nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
spin_unlock_irqrestore(&nc->lock, flags);
return 0;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9266ceebd112..bbc45f8a7b2d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -57,6 +57,10 @@ config NF_CONNTRACK
config NF_LOG_COMMON
tristate
+config NF_LOG_NETDEV
+ tristate "Netdev packet logging"
+ select NF_LOG_COMMON
+
if NF_CONNTRACK
config NF_CONNTRACK_MARK
@@ -142,38 +146,38 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It selected by the connlabel match.
config NF_CT_PROTO_DCCP
- tristate 'DCCP protocol connection tracking support'
+ bool 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on DCCP connections.
- If unsure, say 'N'.
+ If unsure, say Y.
config NF_CT_PROTO_GRE
tristate
config NF_CT_PROTO_SCTP
- tristate 'SCTP protocol connection tracking support'
+ bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_SCTP
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
- If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ If unsure, say Y.
config NF_CT_PROTO_UDPLITE
- tristate 'UDP-Lite protocol connection tracking support'
+ bool 'UDP-Lite protocol connection tracking support'
depends on NETFILTER_ADVANCED
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on UDP-Lite
connections.
- To compile it as a module, choose M here. If unsure, say N.
+ If unsure, say Y.
config NF_CONNTRACK_AMANDA
tristate "Amanda backup protocol support"
@@ -380,17 +384,17 @@ config NF_NAT_NEEDED
default y
config NF_NAT_PROTO_DCCP
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_DCCP
default NF_NAT && NF_CT_PROTO_DCCP
config NF_NAT_PROTO_UDPLITE
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_UDPLITE
default NF_NAT && NF_CT_PROTO_UDPLITE
config NF_NAT_PROTO_SCTP
- tristate
+ bool
default NF_NAT && NF_CT_PROTO_SCTP
depends on NF_NAT && NF_CT_PROTO_SCTP
select LIBCRC32C
@@ -474,20 +478,32 @@ config NFT_META
This option adds the "meta" expression that you can use to match and
to set packet metainformation such as the packet mark.
+config NFT_RT
+ tristate "Netfilter nf_tables routing module"
+ help
+ This option adds the "rt" expression that you can use to match
+ packet routing information such as the packet nexthop.
+
+config NFT_NUMGEN
+ tristate "Netfilter nf_tables number generator module"
+ help
+ This option adds the number generator expression used to perform
+ incremental counting and random numbers bound to a upper limit.
+
config NFT_CT
depends on NF_CONNTRACK
tristate "Netfilter nf_tables conntrack module"
help
- This option adds the "meta" expression that you can use to match
+ This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
-config NFT_RBTREE
+config NFT_SET_RBTREE
tristate "Netfilter nf_tables rbtree set module"
help
This option adds the "rbtree" set type (Red Black tree) that is used
to build interval-based sets.
-config NFT_HASH
+config NFT_SET_HASH
tristate "Netfilter nf_tables hash set module"
help
This option adds the "hash" set type that is used to build one-way
@@ -535,6 +551,12 @@ config NFT_NAT
This option adds the "nat" expression that you can use to perform
typical Network Address Translation (NAT) packet transformations.
+config NFT_OBJREF
+ tristate "Netfilter nf_tables stateful object reference module"
+ help
+ This option adds the "objref" expression that allows you to refer to
+ stateful objects, such as counters and quotas.
+
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
@@ -542,6 +564,12 @@ config NFT_QUEUE
This is required if you intend to use the userspace queueing
infrastructure (also known as NFQUEUE) from nftables.
+config NFT_QUOTA
+ tristate "Netfilter nf_tables quota module"
+ help
+ This option adds the "quota" expression that you can use to match
+ enforce bytes quotas.
+
config NFT_REJECT
default m if NETFILTER_ADVANCED=n
tristate "Netfilter nf_tables reject support"
@@ -563,6 +591,25 @@ config NFT_COMPAT
x_tables match/target extensions over the nf_tables
framework.
+config NFT_HASH
+ tristate "Netfilter nf_tables hash module"
+ help
+ This option adds the "hash" expression that you can use to perform
+ a hash operation on registers.
+
+config NFT_FIB
+ tristate
+
+config NFT_FIB_INET
+ depends on NF_TABLES_INET
+ depends on NFT_FIB_IPV4
+ depends on NFT_FIB_IPV6
+ tristate "Netfilter nf_tables fib inet support"
+ help
+ This option allows using the FIB expression from the inet table.
+ The lookup will be delegated to the IPv4 or IPv6 FIB depending
+ on the protocol of the packet.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -1391,9 +1438,10 @@ config NETFILTER_XT_MATCH_SOCKET
tristate '"socket" match support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
- depends on !NF_CONNTRACK || NF_CONNTRACK
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
+ depends on NF_SOCKET_IPV4
+ depends on NF_SOCKET_IPV6
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 69134541d65b..ca30d1960f1d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,9 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -16,11 +19,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
-# SCTP protocol connection tracking
-obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
-obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
-obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
@@ -45,17 +44,20 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+# NAT protocols (nf_nat)
+nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+
# generic transport layer logging
obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
+# packet logging for netdev family
+obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
+
obj-$(CONFIG_NF_NAT) += nf_nat.o
obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
-
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -71,8 +73,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
# nf_tables
nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
-nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o
nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+nf_tables-objs += nft_lookup.o nft_dynset.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
@@ -80,18 +83,25 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
obj-$(CONFIG_NFT_META) += nft_meta.o
+obj-$(CONFIG_NFT_RT) += nft_rt.o
+obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
+obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
-obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o
-obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
+obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
+obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_FIB) += nft_fib.o
+obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..ce6adfae521a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -22,6 +22,7 @@
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -61,53 +62,58 @@ EXPORT_SYMBOL(nf_hooks_needed);
#endif
static DEFINE_MUTEX(nf_hook_mutex);
+#define nf_entry_dereference(e) \
+ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
-static struct list_head *nf_find_hook_list(struct net *net,
- const struct nf_hook_ops *reg)
+static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg)
{
- struct list_head *hook_list = NULL;
-
if (reg->pf != NFPROTO_NETDEV)
- hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
- else if (reg->hooknum == NF_NETDEV_INGRESS) {
+ return net->nf.hooks[reg->pf]+reg->hooknum;
+
#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == NF_NETDEV_INGRESS) {
if (reg->dev && dev_net(reg->dev) == net)
- hook_list = &reg->dev->nf_hooks_ingress;
-#endif
+ return &reg->dev->nf_hooks_ingress;
}
- return hook_list;
+#endif
+ return NULL;
}
-struct nf_hook_entry {
- const struct nf_hook_ops *orig_ops;
- struct nf_hook_ops ops;
-};
-
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
- struct list_head *hook_list;
- struct nf_hook_entry *entry;
- struct nf_hook_ops *elem;
+ struct nf_hook_entry __rcu **pp;
+ struct nf_hook_entry *entry, *p;
+
+ if (reg->pf == NFPROTO_NETDEV) {
+#ifndef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == NF_NETDEV_INGRESS)
+ return -EOPNOTSUPP;
+#endif
+ if (reg->hooknum != NF_NETDEV_INGRESS ||
+ !reg->dev || dev_net(reg->dev) != net)
+ return -EINVAL;
+ }
+
+ pp = nf_hook_entry_head(net, reg);
+ if (!pp)
+ return -EINVAL;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
- entry->orig_ops = reg;
- entry->ops = *reg;
-
- hook_list = nf_find_hook_list(net, reg);
- if (!hook_list) {
- kfree(entry);
- return -ENOENT;
- }
+ nf_hook_entry_init(entry, reg);
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, hook_list, list) {
- if (reg->priority < elem->priority)
+
+ /* Find the spot in the list */
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (reg->priority < nf_hook_entry_priority(p))
break;
}
- list_add_rcu(&entry->ops.list, elem->list.prev);
+ rcu_assign_pointer(entry->next, p);
+ rcu_assign_pointer(*pp, entry);
+
mutex_unlock(&nf_hook_mutex);
#ifdef CONFIG_NETFILTER_INGRESS
if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -122,24 +128,22 @@ EXPORT_SYMBOL(nf_register_net_hook);
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
- struct list_head *hook_list;
- struct nf_hook_entry *entry;
- struct nf_hook_ops *elem;
+ struct nf_hook_entry __rcu **pp;
+ struct nf_hook_entry *p;
- hook_list = nf_find_hook_list(net, reg);
- if (!hook_list)
+ pp = nf_hook_entry_head(net, reg);
+ if (WARN_ON_ONCE(!pp))
return;
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, hook_list, list) {
- entry = container_of(elem, struct nf_hook_entry, ops);
- if (entry->orig_ops == reg) {
- list_del_rcu(&entry->ops.list);
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (nf_hook_entry_ops(p) == reg) {
+ rcu_assign_pointer(*pp, p->next);
break;
}
}
mutex_unlock(&nf_hook_mutex);
- if (&elem->list == hook_list) {
+ if (!p) {
WARN(1, "nf_unregister_net_hook: hook not found!\n");
return;
}
@@ -151,10 +155,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
- nf_queue_nf_hook_drop(net, &entry->ops);
+ nf_queue_nf_hook_drop(net, p);
/* other cpu might still process nfqueue verdict that used reg */
synchronize_net();
- kfree(entry);
+ kfree(p);
}
EXPORT_SYMBOL(nf_unregister_net_hook);
@@ -188,19 +192,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
static LIST_HEAD(nf_hook_list);
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
{
struct net *net, *last;
int ret;
- rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
- rtnl_unlock();
return 0;
rollback:
@@ -210,19 +212,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+ return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
}
EXPORT_SYMBOL(nf_register_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
{
struct net *net;
- rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ rtnl_lock();
+ _nf_unregister_hook(reg);
rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +263,26 @@ err:
}
EXPORT_SYMBOL(nf_register_hooks);
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = _nf_register_hook(&reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ _nf_unregister_hooks(reg, i);
+ return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
while (n-- > 0)
@@ -253,76 +290,48 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
}
EXPORT_SYMBOL(nf_unregister_hooks);
-unsigned int nf_iterate(struct list_head *head,
- struct sk_buff *skb,
- struct nf_hook_state *state,
- struct nf_hook_ops **elemp)
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
- unsigned int verdict;
-
- /*
- * The caller must not block between calls to this
- * function because of risk of continuing from deleted element.
- */
- list_for_each_entry_continue_rcu((*elemp), head, list) {
- if (state->thresh > (*elemp)->priority)
- continue;
-
- /* Optimization: we don't need to hold module
- reference here, since function can't sleep. --RR */
-repeat:
- verdict = (*elemp)->hook((*elemp)->priv, skb, state);
- if (verdict != NF_ACCEPT) {
-#ifdef CONFIG_NETFILTER_DEBUG
- if (unlikely((verdict & NF_VERDICT_MASK)
- > NF_MAX_VERDICT)) {
- NFDEBUG("Evil return from %p(%u).\n",
- (*elemp)->hook, state->hook);
- continue;
- }
-#endif
- if (verdict != NF_REPEAT)
- return verdict;
- goto repeat;
- }
- }
- return NF_ACCEPT;
+ while (n-- > 0)
+ _nf_unregister_hook(&reg[n]);
}
-
+EXPORT_SYMBOL(_nf_unregister_hooks);
/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
+ * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
+ struct nf_hook_entry *entry)
{
- struct nf_hook_ops *elem;
unsigned int verdict;
- int ret = 0;
-
- /* We may already have this, but read-locks nest anyway */
- rcu_read_lock();
+ int ret;
- elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
-next_hook:
- verdict = nf_iterate(state->hook_list, skb, state, &elem);
- if (verdict == NF_ACCEPT || verdict == NF_STOP) {
- ret = 1;
- } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
- kfree_skb(skb);
- ret = NF_DROP_GETERR(verdict);
- if (ret == 0)
- ret = -EPERM;
- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- int err = nf_queue(skb, elem, state,
- verdict >> NF_VERDICT_QBITS);
- if (err < 0) {
- if (err == -ESRCH &&
- (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
- goto next_hook;
+ do {
+ verdict = nf_hook_entry_hookfn(entry, skb, state);
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ entry = rcu_dereference(entry->next);
+ break;
+ case NF_DROP:
kfree_skb(skb);
+ ret = NF_DROP_GETERR(verdict);
+ if (ret == 0)
+ ret = -EPERM;
+ return ret;
+ case NF_QUEUE:
+ ret = nf_queue(skb, state, &entry, verdict);
+ if (ret == 1 && entry)
+ continue;
+ return ret;
+ default:
+ /* Implicit handling for NF_STOLEN, as well as any other
+ * non conventional verdicts.
+ */
+ return 0;
}
- }
- rcu_read_unlock();
- return ret;
+ } while (entry);
+
+ return 1;
}
EXPORT_SYMBOL(nf_hook_slow);
@@ -441,7 +450,7 @@ static int __net_init netfilter_net_init(struct net *net)
for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+ RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
}
#ifdef CONFIG_PROC_FS
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 234a8ec82076..4083a8051f0f 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET
To compile it as a module, choose M here. If unsure, say N.
+config IP_SET_HASH_IPMAC
+ tristate "hash:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,mac set type support, by which
+ one can store IPv4/IPv6 address and MAC (ethernet address) pairs in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_SET_HASH_MAC
tristate "hash:mac set support"
depends on IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
index 3dbd5e958489..28ec148df02d 100644
--- a/net/netfilter/ipset/Makefile
+++ b/net/netfilter/ipset/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
# hash types
obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMAC) += ip_set_hash_ipmac.o
obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 2e8e7e5fb4a6..6f09a99298cd 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -22,6 +22,7 @@
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_memsize IPSET_TOKEN(MTYPE, _memsize)
#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
@@ -40,11 +41,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct mtype *map = set->data;
- init_timer(&map->gc);
- map->gc.data = (unsigned long)set;
- map->gc.function = gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&map->gc);
+ setup_timer(&map->gc, gc, (unsigned long)set);
+ mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
}
static void
@@ -82,6 +80,16 @@ mtype_flush(struct ip_set *set)
if (set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
memset(map->members, 0, map->memsize);
+ set->elements = 0;
+ set->ext_size = 0;
+}
+
+/* Calculate the actual memory size of the set data */
+static size_t
+mtype_memsize(const struct mtype *map, size_t dsize)
+{
+ return sizeof(*map) + map->memsize +
+ map->elements * dsize;
}
static int
@@ -89,14 +97,15 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
{
const struct mtype *map = set->data;
struct nlattr *nested;
- size_t memsize = sizeof(*map) + map->memsize;
+ size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -140,6 +149,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (ret == IPSET_ADD_FAILED) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(x, set))) {
+ set->elements--;
ret = 0;
} else if (!(flags & IPSET_FLAG_EXIST)) {
set_bit(e->id, map->members);
@@ -148,6 +158,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
/* Element is re-added, cleanup extensions */
ip_set_ext_destroy(set, x);
}
+ if (ret > 0)
+ set->elements--;
if (SET_WITH_TIMEOUT(set))
#ifdef IP_SET_BITMAP_STORED_TIMEOUT
@@ -159,12 +171,13 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(x, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(x, set), ext);
+ ip_set_init_comment(set, ext_comment(x, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
/* Activate element */
set_bit(e->id, map->members);
+ set->elements++;
return 0;
}
@@ -181,6 +194,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
return -IPSET_ERR_EXIST;
ip_set_ext_destroy(set, x);
+ set->elements--;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(x, set)))
return -IPSET_ERR_EXIST;
@@ -276,6 +290,7 @@ mtype_gc(unsigned long ul_set)
if (ip_set_timeout_expired(ext_timeout(x, set))) {
clear_bit(id, map->members);
ip_set_ext_destroy(set, x);
+ set->elements--;
}
}
spin_unlock_bh(&set->lock);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a748b0c2c981..c296f9b606d4 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -36,7 +36,7 @@ struct ip_set_net {
bool is_destroyed; /* all sets are destroyed */
};
-static int ip_set_net_id __read_mostly;
+static unsigned int ip_set_net_id __read_mostly;
static inline struct ip_set_net *ip_set_pernet(struct net *net)
{
@@ -324,7 +324,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
-typedef void (*destroyer)(void *);
+typedef void (*destroyer)(struct ip_set *, void *);
/* ipset data extension types, in size order */
const struct ip_set_ext_type ip_set_extensions[] = {
@@ -426,20 +426,20 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
- ext->skbmark = fullmark >> 32;
- ext->skbmarkmask = fullmark & 0xffffffff;
+ ext->skbinfo.skbmark = fullmark >> 32;
+ ext->skbinfo.skbmarkmask = fullmark & 0xffffffff;
}
if (tb[IPSET_ATTR_SKBPRIO]) {
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
- ext->skbprio = be32_to_cpu(nla_get_be32(
- tb[IPSET_ATTR_SKBPRIO]));
+ ext->skbinfo.skbprio =
+ be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO]));
}
if (tb[IPSET_ATTR_SKBQUEUE]) {
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
- ext->skbqueue = be16_to_cpu(nla_get_be16(
- tb[IPSET_ATTR_SKBQUEUE]));
+ ext->skbinfo.skbqueue =
+ be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE]));
}
return 0;
}
@@ -541,7 +541,7 @@ int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret = 0;
BUG_ON(!set);
@@ -579,7 +579,7 @@ int
ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret;
BUG_ON(!set);
@@ -601,7 +601,7 @@ int
ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret = 0;
BUG_ON(!set);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index d32fd6b036bf..1b05d4a7d5a1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -85,6 +85,8 @@ struct htable {
};
#define hbucket(h, i) ((h)->bucket[i])
+#define ext_size(n, dsize) \
+ (sizeof(struct hbucket) + (n) * (dsize))
#ifndef IPSET_NET_COUNT
#define IPSET_NET_COUNT 1
@@ -150,24 +152,34 @@ htable_bits(u32 hashsize)
#define INIT_CIDR(cidr, host_mask) \
DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
-#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
-
#ifdef IP_SET_HASH_WITH_NET0
-/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
-#define NLEN(family) (SET_HOST_MASK(family) + 1)
+/* cidr from 0 to HOST_MASK value and c = cidr + 1 */
+#define NLEN (HOST_MASK + 1)
#define CIDR_POS(c) ((c) - 1)
#else
-/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
-#define NLEN(family) SET_HOST_MASK(family)
+/* cidr from 1 to HOST_MASK value and c = cidr + 1 */
+#define NLEN HOST_MASK
#define CIDR_POS(c) ((c) - 2)
#endif
#else
-#define NLEN(family) 0
+#define NLEN 0
#endif /* IP_SET_HASH_WITH_NETS */
#endif /* _IP_SET_HASH_GEN_H */
+#ifndef MTYPE
+#error "MTYPE is not defined!"
+#endif
+
+#ifndef HTYPE
+#error "HTYPE is not defined!"
+#endif
+
+#ifndef HOST_MASK
+#error "HOST_MASK is not defined!"
+#endif
+
/* Family dependent templates */
#undef ahash_data
@@ -191,7 +203,6 @@ htable_bits(u32 hashsize)
#undef mtype_same_set
#undef mtype_kadt
#undef mtype_uadt
-#undef mtype
#undef mtype_add
#undef mtype_del
@@ -207,6 +218,7 @@ htable_bits(u32 hashsize)
#undef mtype_variant
#undef mtype_data_match
+#undef htype
#undef HKEY
#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
@@ -233,7 +245,6 @@ htable_bits(u32 hashsize)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
-#define mtype MTYPE
#define mtype_add IPSET_TOKEN(MTYPE, _add)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
@@ -249,62 +260,54 @@ htable_bits(u32 hashsize)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
-#ifndef MTYPE
-#error "MTYPE is not defined!"
-#endif
-
-#ifndef HOST_MASK
-#error "HOST_MASK is not defined!"
-#endif
-
#ifndef HKEY_DATALEN
#define HKEY_DATALEN sizeof(struct mtype_elem)
#endif
-#define HKEY(data, initval, htable_bits) \
-(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \
- & jhash_mask(htable_bits))
+#define htype MTYPE
-#ifndef htype
-#ifndef HTYPE
-#error "HTYPE is not defined!"
-#endif /* HTYPE */
-#define htype HTYPE
+#define HKEY(data, initval, htable_bits) \
+({ \
+ const u32 *__k = (const u32 *)data; \
+ u32 __l = HKEY_DATALEN / sizeof(u32); \
+ \
+ BUILD_BUG_ON(HKEY_DATALEN % sizeof(u32) != 0); \
+ \
+ jhash2(__k, __l, initval) & jhash_mask(htable_bits); \
+})
/* The generic hash structure */
struct htype {
struct htable __rcu *table; /* the hash table */
+ struct timer_list gc; /* garbage collection when timeout enabled */
u32 maxelem; /* max elements in the hash */
- u32 elements; /* current element (vs timeout) */
u32 initval; /* random jhash init value */
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
- struct timer_list gc; /* garbage collection when timeout enabled */
- struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_MULTI
u8 ahash_max; /* max elements in an array block */
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
+ struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_NETS
- struct net_prefixes nets[0]; /* book-keeping of prefixes */
+ struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
#endif
};
-#endif /* htype */
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
* sized networks. cidr == real cidr + 1 to support /0.
*/
static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
{
int i, j;
/* Add in increasing prefix order, so larger cidr first */
- for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
+ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
if (j != -1) {
continue;
} else if (h->nets[i].cidr[n] < cidr) {
@@ -323,11 +326,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
}
static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
{
- u8 i, j, net_end = nets_length - 1;
+ u8 i, j, net_end = NLEN - 1;
- for (i = 0; i < nets_length; i++) {
+ for (i = 0; i < NLEN; i++) {
if (h->nets[i].cidr[n] != cidr)
continue;
h->nets[CIDR_POS(cidr)].nets[n]--;
@@ -343,24 +346,9 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
/* Calculate the actual memory size of the set data */
static size_t
-mtype_ahash_memsize(const struct htype *h, const struct htable *t,
- u8 nets_length, size_t dsize)
+mtype_ahash_memsize(const struct htype *h, const struct htable *t)
{
- u32 i;
- struct hbucket *n;
- size_t memsize = sizeof(*h) + sizeof(*t);
-
-#ifdef IP_SET_HASH_WITH_NETS
- memsize += sizeof(struct net_prefixes) * nets_length;
-#endif
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = rcu_dereference_bh(hbucket(t, i));
- if (!n)
- continue;
- memsize += sizeof(struct hbucket) + n->size * dsize;
- }
-
- return memsize;
+ return sizeof(*h) + sizeof(*t);
}
/* Get the ith element from the array block n */
@@ -398,9 +386,10 @@ mtype_flush(struct ip_set *set)
kfree_rcu(n, rcu);
}
#ifdef IP_SET_HASH_WITH_NETS
- memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
+ memset(h->nets, 0, sizeof(h->nets));
#endif
- h->elements = 0;
+ set->elements = 0;
+ set->ext_size = 0;
}
/* Destroy the hashtable part of the set */
@@ -444,11 +433,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct htype *h = set->data;
- init_timer(&h->gc);
- h->gc.data = (unsigned long)set;
- h->gc.function = gc;
- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&h->gc);
+ setup_timer(&h->gc, gc, (unsigned long)set);
+ mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
pr_debug("gc initialized, run in every %u\n",
IPSET_GC_PERIOD(set->timeout));
}
@@ -473,12 +459,13 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
/* Delete expired elements from the hashtable */
static void
-mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
+mtype_expire(struct ip_set *set, struct htype *h)
{
struct htable *t;
struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
@@ -494,21 +481,20 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
continue;
}
data = ahash_data(n, j, dsize);
- if (ip_set_timeout_expired(ext_timeout(data, set))) {
- pr_debug("expired %u/%u\n", i, j);
- clear_bit(j, n->used);
- smp_mb__after_atomic();
+ if (!ip_set_timeout_expired(ext_timeout(data, set)))
+ continue;
+ pr_debug("expired %u/%u\n", i, j);
+ clear_bit(j, n->used);
+ smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
- for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h,
- NCIDR_PUT(DCIDR_GET(data->cidr,
- k)),
- nets_length, k);
+ for (k = 0; k < IPSET_NET_COUNT; k++)
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr, k)),
+ k);
#endif
- ip_set_ext_destroy(set, data);
- h->elements--;
- d++;
- }
+ ip_set_ext_destroy(set, data);
+ set->elements--;
+ d++;
}
if (d >= AHASH_INIT_SIZE) {
if (d >= n->size) {
@@ -532,6 +518,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
d++;
}
tmp->pos = d;
+ set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, i), tmp);
kfree_rcu(n, rcu);
}
@@ -546,7 +533,7 @@ mtype_gc(unsigned long ul_set)
pr_debug("called\n");
spin_lock_bh(&set->lock);
- mtype_expire(set, h, NLEN(set->family), set->dsize);
+ mtype_expire(set, h);
spin_unlock_bh(&set->lock);
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
@@ -563,7 +550,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t dsize = set->dsize;
+ size_t extsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -606,6 +593,7 @@ retry:
/* There can't be another parallel resizing, but dumping is possible */
atomic_set(&orig->ref, 1);
atomic_inc(&orig->uref);
+ extsize = 0;
pr_debug("attempt to resize set %s from %u to %u, t %p\n",
set->name, orig->htable_bits, htable_bits, orig);
for (i = 0; i < jhash_size(orig->htable_bits); i++) {
@@ -636,6 +624,7 @@ retry:
goto cleanup;
}
m->size = AHASH_INIT_SIZE;
+ extsize = ext_size(AHASH_INIT_SIZE, dsize);
RCU_INIT_POINTER(hbucket(t, key), m);
} else if (m->pos >= m->size) {
struct hbucket *ht;
@@ -655,6 +644,7 @@ retry:
memcpy(ht, m, sizeof(struct hbucket) +
m->size * dsize);
ht->size = m->size + AHASH_INIT_SIZE;
+ extsize += ext_size(AHASH_INIT_SIZE, dsize);
kfree(m);
m = ht;
RCU_INIT_POINTER(hbucket(t, key), ht);
@@ -668,6 +658,7 @@ retry:
}
}
rcu_assign_pointer(h->table, t);
+ set->ext_size = extsize;
spin_unlock_bh(&set->lock);
@@ -715,11 +706,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
bool deleted = false, forceadd = false, reuse = false;
u32 key, multi = 0;
- if (h->elements >= h->maxelem) {
+ if (set->elements >= h->maxelem) {
if (SET_WITH_TIMEOUT(set))
/* FIXME: when set is full, we slow down here */
- mtype_expire(set, h, NLEN(set->family), set->dsize);
- if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ mtype_expire(set, h);
+ if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
forceadd = true;
}
@@ -727,20 +718,15 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
key = HKEY(value, h->initval, t->htable_bits);
n = __ipset_dereference_protected(hbucket(t, key), 1);
if (!n) {
- if (forceadd) {
- if (net_ratelimit())
- pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- return -IPSET_ERR_HASH_FULL;
- } else if (h->elements >= h->maxelem) {
+ if (forceadd || set->elements >= h->maxelem)
goto set_full;
- }
old = NULL;
n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
GFP_ATOMIC);
if (!n)
return -ENOMEM;
n->size = AHASH_INIT_SIZE;
+ set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
for (i = 0; i < n->pos; i++) {
@@ -778,14 +764,14 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
for (i = 0; i < IPSET_NET_COUNT; i++)
mtype_del_cidr(h,
NCIDR_PUT(DCIDR_GET(data->cidr, i)),
- NLEN(set->family), i);
+ i);
#endif
ip_set_ext_destroy(set, data);
- h->elements--;
+ set->elements--;
}
goto copy_data;
}
- if (h->elements >= h->maxelem)
+ if (set->elements >= h->maxelem)
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
@@ -804,17 +790,17 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
memcpy(n, old, sizeof(struct hbucket) +
old->size * set->dsize);
n->size = old->size + AHASH_INIT_SIZE;
+ set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
}
copy_elem:
j = n->pos++;
data = ahash_data(n, j, set->dsize);
copy_data:
- h->elements++;
+ set->elements++;
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
- NLEN(set->family), i);
+ mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
#endif
memcpy(data, d, sizeof(struct mtype_elem));
overwrite_extensions:
@@ -824,7 +810,7 @@ overwrite_extensions:
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(data, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(data, set), ext);
+ ip_set_init_comment(set, ext_comment(data, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
/* Must come last for the case when timed out entry is reused */
@@ -883,11 +869,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
smp_mb__after_atomic();
if (i + 1 == n->pos)
n->pos--;
- h->elements--;
+ set->elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
- NLEN(set->family), j);
+ j);
#endif
ip_set_ext_destroy(set, data);
@@ -896,6 +882,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
if (n->pos == 0 && k == 0) {
+ set->ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
} else if (k >= AHASH_INIT_SIZE) {
@@ -914,6 +901,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
tmp->pos = k;
+ set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, key), tmp);
kfree_rcu(n, rcu);
}
@@ -957,14 +945,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
int i, j = 0;
#endif
u32 key, multi = 0;
- u8 nets_length = NLEN(set->family);
pr_debug("test by nets\n");
- for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
+ for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
#if IPSET_NET_COUNT == 2
mtype_data_reset_elem(d, &orig);
mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
- for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
+ for (k = 0; k < NLEN && h->nets[k].cidr[1] && !multi;
k++) {
mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
true);
@@ -1021,7 +1008,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
* try all possible network sizes
*/
for (i = 0; i < IPSET_NET_COUNT; i++)
- if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
+ if (DCIDR_GET(d->cidr, i) != HOST_MASK)
break;
if (i == IPSET_NET_COUNT) {
ret = mtype_test_cidrs(set, d, ext, mext, flags);
@@ -1062,7 +1049,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
- memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+ memsize = mtype_ahash_memsize(h, t) + set->ext_size;
htable_bits = t->htable_bits;
rcu_read_unlock_bh();
@@ -1083,7 +1070,8 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
goto nla_put_failure;
#endif
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -1238,41 +1226,35 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
struct htype *h;
struct htable *t;
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+
#ifndef IP_SET_PROTO_UNDEF
if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
return -IPSET_ERR_INVALID_FAMILY;
#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- markmask = 0xffffffff;
-#endif
-#ifdef IP_SET_HASH_WITH_NETMASK
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
- pr_debug("Create set %s with family %s\n",
- set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
-#endif
-
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
+
#ifdef IP_SET_HASH_WITH_MARKMASK
/* Separated condition in order to avoid directive in argument list */
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
return -IPSET_ERR_PROTOCOL;
-#endif
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
+ markmask = 0xffffffff;
+ if (tb[IPSET_ATTR_MARKMASK]) {
+ markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
+ if (markmask == 0)
+ return -IPSET_ERR_INVALID_MARKMASK;
}
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+#endif
#ifdef IP_SET_HASH_WITH_NETMASK
+ netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
@@ -1282,33 +1264,21 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
return -IPSET_ERR_INVALID_NETMASK;
}
#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- if (tb[IPSET_ATTR_MARKMASK]) {
- markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
- if (markmask == 0)
- return -IPSET_ERR_INVALID_MARKMASK;
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
}
-#endif
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
hsize = sizeof(*h);
-#ifdef IP_SET_HASH_WITH_NETS
- hsize += sizeof(struct net_prefixes) * NLEN(set->family);
-#endif
h = kzalloc(hsize, GFP_KERNEL);
if (!h)
return -ENOMEM;
- h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
- h->netmask = netmask;
-#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- h->markmask = markmask;
-#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
- set->timeout = IPSET_NO_TIMEOUT;
-
hbits = htable_bits(hashsize);
hsize = htable_size(hbits);
if (hsize == 0) {
@@ -1320,8 +1290,17 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
kfree(h);
return -ENOMEM;
}
+ h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ h->markmask = markmask;
+#endif
+ get_random_bytes(&h->initval, sizeof(h->initval));
+
t->htable_bits = hbits;
- rcu_assign_pointer(h->table, t);
+ RCU_INIT_POINTER(h->table, t);
set->data = h;
#ifndef IP_SET_PROTO_UNDEF
@@ -1339,6 +1318,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
__alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
}
#endif
+ set->timeout = IPSET_NO_TIMEOUT;
if (tb[IPSET_ATTR_TIMEOUT]) {
set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
#ifndef IP_SET_PROTO_UNDEF
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 9d6bf19f7b78..20bfbd315f61 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -82,7 +82,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -199,7 +199,7 @@ nla_put_failure:
}
static inline void
-hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
+hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e)
{
}
@@ -217,7 +217,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip6_elem e = { { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -234,7 +234,7 @@ static int
hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip6_elem e = { { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
new file mode 100644
index 000000000000..1ab5ed2f6839
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -0,0 +1,315 @@
+/* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mac type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
+IP_SET_MODULE_DESC("hash:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mac");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipmac
+
+/* Zero valued element is not supported */
+static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmac4_elem {
+ /* Zero valued IP addresses cannot be stored */
+ __be32 ip;
+ union {
+ unsigned char ether[ETH_ALEN];
+ __be32 foo[2];
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1,
+ const struct hash_ipmac4_elem *e2,
+ u32 *multi)
+{
+ return e1->ip == e2->ip && ether_addr_equal(e1->ether, e2->ether);
+}
+
+static bool
+hash_ipmac4_data_list(struct sk_buff *skb, const struct hash_ipmac4_elem *e)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) ||
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_ipmac4_data_next(struct hash_ipmac4_elem *next,
+ const struct hash_ipmac4_elem *e)
+{
+ next->ip = e->ip;
+}
+
+#define MTYPE hash_ipmac4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipmac4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_ETHER] ||
+ nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -IPSET_ERR_HASH_ELEM;
+
+ return adtfn(set, &e, &ext, &ext, flags);
+}
+
+/* IPv6 variant */
+
+/* Member elements */
+struct hash_ipmac6_elem {
+ /* Zero valued IP addresses cannot be stored */
+ union nf_inet_addr ip;
+ union {
+ unsigned char ether[ETH_ALEN];
+ __be32 foo[2];
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1,
+ const struct hash_ipmac6_elem *e2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&e1->ip.in6, &e2->ip.in6) &&
+ ether_addr_equal(e1->ether, e2->ether);
+}
+
+static bool
+hash_ipmac6_data_list(struct sk_buff *skb, const struct hash_ipmac6_elem *e)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_ipmac6_data_next(struct hash_ipmac6_elem *next,
+ const struct hash_ipmac6_elem *e)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipmac6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipmac6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac6_elem e = {
+ { .all = { 0 } },
+ { .foo[0] = 0, .foo[1] = 0 }
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac6_elem e = {
+ { .all = { 0 } },
+ { .foo[0] = 0, .foo[1] = 0 }
+ };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_ETHER] ||
+ nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -IPSET_ERR_HASH_ELEM;
+
+ return adtfn(set, &e, &ext, &ext, flags);
+}
+
+static struct ip_set_type hash_ipmac_type __read_mostly = {
+ .name = "hash:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipmac_init(void)
+{
+ return ip_set_type_register(&hash_ipmac_type);
+}
+
+static void __exit
+hash_ipmac_fini(void)
+{
+ ip_set_type_unregister(&hash_ipmac_type);
+}
+
+module_init(hash_ipmac_init);
+module_exit(hash_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index a0695a2ab585..b64cf14e8352 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -85,7 +85,7 @@ hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -193,7 +193,7 @@ nla_put_failure:
}
static inline void
-hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+hash_ipmark6_data_next(struct hash_ipmark6_elem *next,
const struct hash_ipmark6_elem *d)
{
}
@@ -211,7 +211,7 @@ hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -227,7 +227,7 @@ static int
hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 9d84b3dff603..f438740e6c6a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -108,7 +108,7 @@ static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport *h = set->data;
+ const struct hash_ipport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -231,7 +231,7 @@ nla_put_failure:
}
static inline void
-hash_ipport6_data_next(struct hash_ipport4_elem *next,
+hash_ipport6_data_next(struct hash_ipport6_elem *next,
const struct hash_ipport6_elem *d)
{
next->port = d->port;
@@ -266,7 +266,7 @@ static int
hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport *h = set->data;
+ const struct hash_ipport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 215b7b942038..6215fb898c50 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -111,7 +111,7 @@ static int
hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip *h = set->data;
+ const struct hash_ipportip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -241,7 +241,7 @@ nla_put_failure:
}
static inline void
-hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
+hash_ipportip6_data_next(struct hash_ipportip6_elem *next,
const struct hash_ipportip6_elem *d)
{
next->port = d->port;
@@ -277,7 +277,7 @@ static int
hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip *h = set->data;
+ const struct hash_ipportip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 9ca719625ea3..5ab1b99a53c2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -138,7 +138,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -163,7 +163,7 @@ static int
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -370,7 +370,7 @@ nla_put_failure:
}
static inline void
-hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next,
const struct hash_ipportnet6_elem *d)
{
next->port = d->port;
@@ -389,7 +389,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -414,7 +414,7 @@ static int
hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 3e4bffdc1cc0..5d9e895452e7 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -117,7 +117,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_net *h = set->data;
+ const struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -139,7 +139,7 @@ static int
hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_net *h = set->data;
+ const struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = { .cidr = HOST_MASK };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -268,7 +268,7 @@ nla_put_failure:
}
static inline void
-hash_net6_data_next(struct hash_net4_elem *next,
+hash_net6_data_next(struct hash_net6_elem *next,
const struct hash_net6_elem *d)
{
}
@@ -286,7 +286,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_net *h = set->data;
+ const struct hash_net6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f0f688db6213..44cf11939c91 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -156,7 +156,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -170,7 +170,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
e.ip &= ip_set_netmask(e.cidr);
-#define IFACE(dir) (par->dir ? par->dir->name : "")
+#define IFACE(dir) (par->state->dir ? par->state->dir->name : "")
#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
@@ -196,7 +196,7 @@ static int
hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -348,7 +348,7 @@ nla_put_failure:
}
static inline void
-hash_netiface6_data_next(struct hash_netiface4_elem *next,
+hash_netiface6_data_next(struct hash_netiface6_elem *next,
const struct hash_netiface6_elem *d)
{
}
@@ -367,7 +367,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index a93dfebffa81..db614e13b193 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -143,7 +143,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -165,7 +165,7 @@ static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -352,7 +352,7 @@ nla_put_failure:
}
static inline void
-hash_netnet6_data_next(struct hash_netnet4_elem *next,
+hash_netnet6_data_next(struct hash_netnet6_elem *next,
const struct hash_netnet6_elem *d)
{
}
@@ -377,7 +377,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 731813e0f08c..54b64b6cd0cd 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -133,7 +133,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -157,7 +157,7 @@ static int
hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -329,7 +329,7 @@ nla_put_failure:
}
static inline void
-hash_netport6_data_next(struct hash_netport4_elem *next,
+hash_netport6_data_next(struct hash_netport6_elem *next,
const struct hash_netport6_elem *d)
{
next->port = d->port;
@@ -348,7 +348,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -372,7 +372,7 @@ static int
hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 9a14c237830f..aff846960ac4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -154,7 +154,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -180,7 +180,7 @@ static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -406,7 +406,7 @@ nla_put_failure:
}
static inline void
-hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
+hash_netportnet6_data_next(struct hash_netportnet6_elem *next,
const struct hash_netportnet6_elem *d)
{
next->port = d->port;
@@ -432,7 +432,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -458,7 +458,7 @@ static int
hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index a2a89e4e0a14..51077c53d76b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -166,6 +166,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
static inline void
list_set_del(struct ip_set *set, struct set_elem *e)
{
+ set->elements--;
list_del_rcu(&e->list);
call_rcu(&e->rcu, __list_set_del_rcu);
}
@@ -227,7 +228,7 @@ list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(e, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
+ ip_set_init_comment(set, ext_comment(e, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
/* Update timeout last */
@@ -309,6 +310,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
list_add_rcu(&e->list, &prev->list);
else
list_add_tail_rcu(&e->list, &map->members);
+ set->elements++;
return 0;
}
@@ -419,6 +421,8 @@ list_set_flush(struct ip_set *set)
list_for_each_entry_safe(e, n, &map->members, list)
list_set_del(set, e);
+ set->elements = 0;
+ set->ext_size = 0;
}
static void
@@ -441,12 +445,12 @@ list_set_destroy(struct ip_set *set)
set->data = NULL;
}
-static int
-list_set_head(struct ip_set *set, struct sk_buff *skb)
+/* Calculate the actual memory size of the set data */
+static size_t
+list_set_memsize(const struct list_set *map, size_t dsize)
{
- const struct list_set *map = set->data;
- struct nlattr *nested;
struct set_elem *e;
+ size_t memsize;
u32 n = 0;
rcu_read_lock();
@@ -454,13 +458,25 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
n++;
rcu_read_unlock();
+ memsize = sizeof(*map) + n * dsize;
+
+ return memsize;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+ size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size;
+
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + n * set->dsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -570,11 +586,8 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct list_set *map = set->data;
- init_timer(&map->gc);
- map->gc.data = (unsigned long)set;
- map->gc.function = gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&map->gc);
+ setup_timer(&map->gc, gc, (unsigned long)set);
+ mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
}
/* Create list:set type of sets */
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2c1b498a7a27..db40050f8785 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
-static int ip_vs_net_id __read_mostly;
+static unsigned int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c3c809b2e712..55e0169caa4c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -48,7 +48,7 @@
#include <net/sock.h>
#include <net/genetlink.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/ip_vs.h>
@@ -2840,14 +2840,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
*/
/* IPVS genetlink family */
-static struct genl_family ip_vs_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = IPVS_GENL_NAME,
- .version = IPVS_GENL_VERSION,
- .maxattr = IPVS_CMD_MAX,
- .netnsok = true, /* Make ipvsadm to work on netns */
-};
+static struct genl_family ip_vs_genl_family;
/* Policy used for first-level command attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
@@ -3267,7 +3260,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
- if (IS_ERR(svc) || svc == NULL)
+ if (IS_ERR_OR_NULL(svc))
goto out_err;
/* Dump the destinations */
@@ -3872,10 +3865,20 @@ static const struct genl_ops ip_vs_genl_ops[] = {
},
};
+static struct genl_family ip_vs_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = IPVS_GENL_NAME,
+ .version = IPVS_GENL_VERSION,
+ .maxattr = IPVS_CMD_ATTR_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
+ .module = THIS_MODULE,
+ .ops = ip_vs_genl_ops,
+ .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+};
+
static int __init ip_vs_genl_register(void)
{
- return genl_register_family_with_ops(&ip_vs_genl_family,
- ip_vs_genl_ops);
+ return genl_register_family(&ip_vs_genl_family);
}
static void ip_vs_genl_unregister(void)
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index f04fd8df210b..fc230d99aa3b 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -281,13 +281,10 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
- /* Show what happens instead of calling nf_ct_kill() */
- if (del_timer(&ct->timeout)) {
- IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ if (nf_ct_kill(ct)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
FMT_TUPLE "\n",
__func__, ct, ARG_TUPLE(&tuple));
- if (ct->timeout.function)
- ct->timeout.function(ct->timeout.data);
} else {
IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
FMT_TUPLE "\n",
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 1b07578bedf3..9350530c16c1 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -283,6 +283,7 @@ struct ip_vs_sync_buff {
*/
static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
{
+ memset(ho, 0, sizeof(*ho));
ho->init_seq = get_unaligned_be32(&no->init_seq);
ho->delta = get_unaligned_be32(&no->delta);
ho->previous_delta = get_unaligned_be32(&no->previous_delta);
@@ -917,8 +918,10 @@ static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *pa
kfree(param->pe_data);
}
- if (opt)
- memcpy(&cp->in_seq, opt, sizeof(*opt));
+ if (opt) {
+ cp->in_seq = opt->in_seq;
+ cp->out_seq = opt->out_seq;
+ }
atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
cp->state = state;
cp->old_state = cp->state;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 01d3d894de46..4e1a98fcc8c3 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -254,6 +254,54 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
return true;
}
+static inline bool decrement_ttl(struct netns_ipvs *ipvs,
+ int skb_af,
+ struct sk_buff *skb)
+{
+ struct net *net = ipvs->net;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ /* check and decrement ttl */
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ /* Force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
+
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ return false;
+
+ ipv6_hdr(skb)->hop_limit--;
+ } else
+#endif
+ {
+ if (ip_hdr(skb)->ttl <= 1) {
+ /* Tell the sender its packet died... */
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return false;
+
+ /* Decrease ttl */
+ ip_decrease_ttl(ip_hdr(skb));
+ }
+
+ return true;
+}
+
/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
@@ -326,6 +374,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
mtu = dst_mtu(&rt->dst);
} else {
@@ -473,6 +524,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
/* MTU checking */
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
mtu = dst_mtu(&rt->dst);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9934b0c93c1e..4e8083c5e01d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -72,12 +72,27 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+struct conntrack_gc_work {
+ struct delayed_work dwork;
+ u32 last_bucket;
+ bool exiting;
+ long next_gc_run;
+};
+
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly seqcount_t nf_conntrack_generation;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
+/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
+#define GC_MAX_BUCKETS_DIV 128u
+/* upper bound of full table scan */
+#define GC_MAX_SCAN_JIFFIES (16u * HZ)
+/* desired ratio of entries found to be expired */
+#define GC_EVICT_RATIO 50u
+
+static struct conntrack_gc_work conntrack_gc_work;
+
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
spin_lock(lock);
@@ -164,7 +179,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
+seqcount_t nf_conntrack_generation __read_mostly;
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
@@ -367,12 +382,10 @@ static void
destroy_conntrack(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- struct net *net = nf_ct_net(ct);
struct nf_conntrack_l4proto *l4proto;
pr_debug("destroy_conntrack(%p)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
- NF_CT_ASSERT(!timer_pending(&ct->timeout));
if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
@@ -395,7 +408,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- NF_CT_STAT_INC(net, delete);
local_bh_enable();
if (ct->master)
@@ -427,7 +439,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
nf_ct_add_to_dying_list(ct);
- NF_CT_STAT_INC(net, delete_list);
local_bh_enable();
}
@@ -435,35 +446,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
+ if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
+ return false;
+
tstamp = nf_conn_tstamp_find(ct);
if (tstamp && tstamp->stop == 0)
tstamp->stop = ktime_get_real_ns();
- if (nf_ct_is_dying(ct))
- goto delete;
-
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
- /* destroy event was not delivered */
+ /* destroy event was not delivered. nf_ct_put will
+ * be done by event cache worker on redelivery.
+ */
nf_ct_delete_from_lists(ct);
nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
return false;
}
nf_conntrack_ecache_work(nf_ct_net(ct));
- set_bit(IPS_DYING_BIT, &ct->status);
- delete:
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);
-static void death_by_timeout(unsigned long ul_conntrack)
-{
- nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
-}
-
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
@@ -481,22 +487,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
net_eq(net, nf_ct_net(ct));
}
-/* must be called with rcu read lock held */
-void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+/* caller must hold rcu readlock and none of the nf_conntrack_locks */
+static void nf_ct_gc_expired(struct nf_conn *ct)
{
- struct hlist_nulls_head *hptr;
- unsigned int sequence, hsz;
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ return;
- do {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
- hsz = nf_conntrack_htable_size;
- hptr = nf_conntrack_hash;
- } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ if (nf_ct_should_gc(ct))
+ nf_ct_kill(ct);
- *hash = hptr;
- *hsize = hsz;
+ nf_ct_put(ct);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
/*
* Warning :
@@ -510,21 +511,26 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int bucket, sequence;
+ unsigned int bucket, hsize;
begin:
- do {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
- bucket = scale_hash(hash);
- ct_hash = nf_conntrack_hash;
- } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ nf_conntrack_get_ht(&ct_hash, &hsize);
+ bucket = reciprocal_scale(hash, hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
- if (nf_ct_key_equal(h, tuple, zone, net)) {
- NF_CT_STAT_INC_ATOMIC(net, found);
- return h;
+ struct nf_conn *ct;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (nf_ct_is_expired(ct)) {
+ nf_ct_gc_expired(ct);
+ continue;
}
- NF_CT_STAT_INC_ATOMIC(net, searched);
+
+ if (nf_ct_is_dying(ct))
+ continue;
+
+ if (nf_ct_key_equal(h, tuple, zone, net))
+ return h;
}
/*
* if the nulls value we got at the end of this lookup is
@@ -618,7 +624,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
zone, net))
goto out;
- add_timer(&ct->timeout);
smp_wmb();
/* The caller holds a reference to this object */
atomic_set(&ct->ct_general.use, 2);
@@ -771,15 +776,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
- ct->timeout.expires += jiffies;
- add_timer(&ct->timeout);
+ ct->timeout += nfct_time_stamp;
atomic_inc(&ct->ct_general.use);
ct->status |= IPS_CONFIRMED;
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
if (tstamp) {
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp(skb);
tstamp->start = ktime_to_ns(skb->tstamp);
@@ -791,7 +795,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert);
local_bh_enable();
help = nfct_help(ct);
@@ -823,29 +826,40 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
- unsigned int hash, sequence;
+ unsigned int hash, hsize;
struct hlist_nulls_node *n;
struct nf_conn *ct;
zone = nf_ct_zone(ignored_conntrack);
rcu_read_lock();
- do {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
- hash = hash_conntrack(net, tuple);
- ct_hash = nf_conntrack_hash;
- } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ begin:
+ nf_conntrack_get_ht(&ct_hash, &hsize);
+ hash = __hash_conntrack(net, tuple, hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
- if (ct != ignored_conntrack &&
- nf_ct_key_equal(h, tuple, zone, net)) {
+
+ if (ct == ignored_conntrack)
+ continue;
+
+ if (nf_ct_is_expired(ct)) {
+ nf_ct_gc_expired(ct);
+ continue;
+ }
+
+ if (nf_ct_key_equal(h, tuple, zone, net)) {
NF_CT_STAT_INC_ATOMIC(net, found);
rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC_ATOMIC(net, searched);
}
+
+ if (get_nulls_value(n) != hash) {
+ NF_CT_STAT_INC_ATOMIC(net, search_restart);
+ goto begin;
+ }
+
rcu_read_unlock();
return 0;
@@ -867,6 +881,11 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
+ if (nf_ct_is_expired(tmp)) {
+ nf_ct_gc_expired(tmp);
+ continue;
+ }
+
if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
!net_eq(nf_ct_net(tmp), net) ||
nf_ct_is_dying(tmp))
@@ -884,7 +903,6 @@ static unsigned int early_drop_list(struct net *net,
*/
if (net_eq(nf_ct_net(tmp), net) &&
nf_ct_is_confirmed(tmp) &&
- del_timer(&tmp->timeout) &&
nf_ct_delete(tmp, 0, 0))
drops++;
@@ -900,14 +918,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
struct hlist_nulls_head *ct_hash;
- unsigned hash, sequence, drops;
+ unsigned int hash, hsize, drops;
rcu_read_lock();
- do {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
- hash = scale_hash(_hash++);
- ct_hash = nf_conntrack_hash;
- } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ nf_conntrack_get_ht(&ct_hash, &hsize);
+ hash = reciprocal_scale(_hash++, hsize);
drops = early_drop_list(net, &ct_hash[hash]);
rcu_read_unlock();
@@ -921,6 +936,97 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
return false;
}
+static void gc_worker(struct work_struct *work)
+{
+ unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
+ unsigned int i, goal, buckets = 0, expired_count = 0;
+ struct conntrack_gc_work *gc_work;
+ unsigned int ratio, scanned = 0;
+ unsigned long next_run;
+
+ gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
+
+ goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
+ i = gc_work->last_bucket;
+
+ do {
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_head *ct_hash;
+ struct hlist_nulls_node *n;
+ unsigned int hashsz;
+ struct nf_conn *tmp;
+
+ i++;
+ rcu_read_lock();
+
+ nf_conntrack_get_ht(&ct_hash, &hashsz);
+ if (i >= hashsz)
+ i = 0;
+
+ hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ tmp = nf_ct_tuplehash_to_ctrack(h);
+
+ scanned++;
+ if (nf_ct_is_expired(tmp)) {
+ nf_ct_gc_expired(tmp);
+ expired_count++;
+ continue;
+ }
+ }
+
+ /* could check get_nulls_value() here and restart if ct
+ * was moved to another chain. But given gc is best-effort
+ * we will just continue with next hash slot.
+ */
+ rcu_read_unlock();
+ cond_resched_rcu_qs();
+ } while (++buckets < goal);
+
+ if (gc_work->exiting)
+ return;
+
+ /*
+ * Eviction will normally happen from the packet path, and not
+ * from this gc worker.
+ *
+ * This worker is only here to reap expired entries when system went
+ * idle after a busy period.
+ *
+ * The heuristics below are supposed to balance conflicting goals:
+ *
+ * 1. Minimize time until we notice a stale entry
+ * 2. Maximize scan intervals to not waste cycles
+ *
+ * Normally, expire ratio will be close to 0.
+ *
+ * As soon as a sizeable fraction of the entries have expired
+ * increase scan frequency.
+ */
+ ratio = scanned ? expired_count * 100 / scanned : 0;
+ if (ratio > GC_EVICT_RATIO) {
+ gc_work->next_gc_run = min_interval;
+ } else {
+ unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
+
+ BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+
+ gc_work->next_gc_run += min_interval;
+ if (gc_work->next_gc_run > max)
+ gc_work->next_gc_run = max;
+ }
+
+ next_run = gc_work->next_gc_run;
+ gc_work->last_bucket = i;
+ queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
+}
+
+static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
+{
+ INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
+ gc_work->next_gc_run = HZ;
+ gc_work->exiting = false;
+}
+
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
@@ -957,8 +1063,6 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
- /* Don't set timer yet: wait for confirmation */
- setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
@@ -1096,10 +1200,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
spin_unlock(&nf_conntrack_expect_lock);
}
- if (!exp) {
+ if (!exp)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- NF_CT_STAT_INC(net, new);
- }
/* Now it is inserted into the unconfirmed list, bump refcount */
nf_conntrack_get(&ct->ct_general);
@@ -1204,7 +1306,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
skb->nfct = NULL;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
l3proto = __nf_ct_l3proto_find(pf);
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
&dataoff, &protonum);
@@ -1234,7 +1336,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (skb->nfct)
goto out;
}
-
+repeat:
ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
@@ -1266,6 +1368,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
NF_CT_STAT_INC_ATOMIC(net, invalid);
if (ret == -NF_DROP)
NF_CT_STAT_INC_ATOMIC(net, drop);
+ /* Special case: TCP tracker reports an attempt to reopen a
+ * closed/aborted connection. We have to go back and create a
+ * fresh conntrack.
+ */
+ if (ret == -NF_REPEAT)
+ goto repeat;
ret = -ret;
goto out;
}
@@ -1273,15 +1381,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
- if (tmpl) {
- /* Special case: we have to repeat this hook, assign the
- * template again to this packet. We assume that this packet
- * has no conntrack assigned. This is used by nf_ct_tcp. */
- if (ret == NF_REPEAT)
- skb->nfct = (struct nf_conntrack *)tmpl;
- else
- nf_ct_put(tmpl);
- }
+ if (tmpl)
+ nf_ct_put(tmpl);
return ret;
}
@@ -1332,7 +1433,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
unsigned long extra_jiffies,
int do_acct)
{
- NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);
/* Only update if this is not a fixed timeout */
@@ -1340,39 +1440,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
goto acct;
/* If not in hash table, timer will not be active yet */
- if (!nf_ct_is_confirmed(ct)) {
- ct->timeout.expires = extra_jiffies;
- } else {
- unsigned long newtime = jiffies + extra_jiffies;
-
- /* Only update the timeout if the new timeout is at least
- HZ jiffies from the old timeout. Need del_timer for race
- avoidance (may already be dying). */
- if (newtime - ct->timeout.expires >= HZ)
- mod_timer_pending(&ct->timeout, newtime);
- }
+ if (nf_ct_is_confirmed(ct))
+ extra_jiffies += nfct_time_stamp;
+ ct->timeout = extra_jiffies;
acct:
if (do_acct)
nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
-bool __nf_ct_kill_acct(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
- int do_acct)
+bool nf_ct_kill_acct(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb)
{
- if (do_acct)
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, ctinfo, skb->len);
- if (del_timer(&ct->timeout)) {
- ct->timeout.function((unsigned long)ct);
- return true;
- }
- return false;
+ return nf_ct_delete(ct, 0, 0);
}
-EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -1505,11 +1591,8 @@ void nf_ct_iterate_cleanup(struct net *net,
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
- if (del_timer(&ct->timeout))
- nf_ct_delete(ct, portid, report);
-
- /* ... else the timer will get him soon. */
+ nf_ct_delete(ct, portid, report);
nf_ct_put(ct);
cond_resched();
}
@@ -1545,6 +1628,7 @@ static int untrack_refs(void)
void nf_conntrack_cleanup_start(void)
{
+ conntrack_gc_work.exiting = true;
RCU_INIT_POINTER(ip_ct_attach, NULL);
}
@@ -1554,6 +1638,7 @@ void nf_conntrack_cleanup_end(void)
while (untrack_refs() > 0)
schedule();
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
nf_conntrack_proto_fini();
@@ -1828,6 +1913,10 @@ int nf_conntrack_init_start(void)
}
/* - and look it like as a confirmed connection */
nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+
+ conntrack_gc_work_init(&conntrack_gc_work);
+ queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
+
return 0;
err_proto:
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index d28011b42845..da9df2d56e66 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_ecache *e;
- if (nf_ct_is_dying(ct))
+ if (!nf_ct_is_confirmed(ct))
+ continue;
+
+ e = nf_ct_ecache_find(ct);
+ if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
continue;
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
@@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
break;
}
- /* we've got the event delivered, now it's dying */
- set_bit(IPS_DYING_BIT, &ct->status);
+ e->state = NFCT_ECACHE_DESTROY_SENT;
refs[evicted] = ct;
if (++evicted >= ARRAY_SIZE(refs)) {
@@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
if (!e)
goto out_unlock;
- if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
+ if (nf_ct_is_confirmed(ct)) {
struct nf_ct_event item = {
.ct = ct,
.portid = e->portid ? e->portid : portid,
@@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
* triggered by a process, we store the PORTID
* to include it in the retransmission.
*/
- if (eventmask & (1 << IPCT_DESTROY) &&
- e->portid == 0 && portid != 0)
- e->portid = portid;
- else
+ if (eventmask & (1 << IPCT_DESTROY)) {
+ if (e->portid == 0 && portid != 0)
+ e->portid = portid;
+ e->state = NFCT_ECACHE_DESTROY_FAIL;
+ } else {
e->missed |= eventmask;
+ }
} else {
e->missed &= ~missed;
}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 43147005bea3..e3ed20060878 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -237,7 +237,7 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
}
delim = data[0];
if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
- pr_debug("try_eprt: invalid delimitter.\n");
+ pr_debug("try_eprt: invalid delimiter.\n");
return 0;
}
@@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen,
size_t i = plen;
pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
- if (dlen == 0)
- return 0;
if (dlen <= plen) {
/* Short packet: try for partial? */
@@ -311,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen,
else return 0;
}
- if (strncasecmp(data, pattern, plen) != 0) {
-#if 0
- size_t i;
-
- pr_debug("ftp: string mismatch\n");
- for (i = 0; i < plen; i++) {
- pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
- i, data[i], data[i],
- pattern[i], pattern[i]);
- }
-#endif
+ if (strncasecmp(data, pattern, plen) != 0)
return 0;
- }
pr_debug("Pattern matches!\n");
/* Now we've found the constant string, try to skip
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5c0db5c64734..f65d93639d12 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net,
const struct nf_afinfo *afinfo;
int ret = 0;
- /* rcu_read_lock()ed by nf_hook_slow() */
+ /* rcu_read_lock()ed by nf_hook_thresh */
afinfo = nf_get_afinfo(family);
if (!afinfo)
return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index b989b81ac156..7341adf7059d 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -138,9 +138,14 @@ __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
for (i = 0; i < nf_ct_helper_hsize; i++) {
hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) {
- if (!strcmp(h->name, name) &&
- h->tuple.src.l3num == l3num &&
- h->tuple.dst.protonum == protonum)
+ if (strcmp(h->name, name))
+ continue;
+
+ if (h->tuple.src.l3num != NFPROTO_UNSPEC &&
+ h->tuple.src.l3num != l3num)
+ continue;
+
+ if (h->tuple.dst.protonum == protonum)
return h;
}
}
@@ -189,7 +194,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
struct net *net = nf_ct_net(ct);
- int ret = 0;
/* We already got a helper explicitly attached. The function
* nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -223,15 +227,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (helper == NULL) {
if (help)
RCU_INIT_POINTER(help->helper, NULL);
- goto out;
+ return 0;
}
if (help == NULL) {
help = nf_ct_helper_ext_add(ct, helper, flags);
- if (help == NULL) {
- ret = -ENOMEM;
- goto out;
- }
+ if (help == NULL)
+ return -ENOMEM;
} else {
/* We only allow helper re-assignment of the same sort since
* we cannot reallocate the helper extension area.
@@ -240,13 +242,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (tmp && tmp->help != helper->help) {
RCU_INIT_POINTER(help->helper, NULL);
- goto out;
+ return 0;
}
}
rcu_assign_pointer(help->helper, helper);
-out:
- return ret;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
@@ -349,7 +351,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
/* Called from the helper function, this call never fails */
help = nfct_help(ct);
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index fdfc71f416b7..27540455dc62 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -149,10 +149,7 @@ nla_put_failure:
static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
- long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
-
- if (timeout < 0)
- timeout = 0;
+ long timeout = nf_ct_expires(ct) / HZ;
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
@@ -818,14 +815,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
struct hlist_nulls_node *n;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
- int res;
+ struct nf_conn *nf_ct_evict[8];
+ int res, i;
spinlock_t *lockp;
last = (struct nf_conn *)cb->args[1];
+ i = 0;
local_bh_disable();
for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
+ while (i) {
+ i--;
+ if (nf_ct_should_gc(nf_ct_evict[i]))
+ nf_ct_kill(nf_ct_evict[i]);
+ nf_ct_put(nf_ct_evict[i]);
+ }
+
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
if (cb->args[0] >= nf_conntrack_htable_size) {
@@ -837,6 +843,13 @@ restart:
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (nf_ct_is_expired(ct)) {
+ if (i < ARRAY_SIZE(nf_ct_evict) &&
+ atomic_inc_not_zero(&ct->ct_general.use))
+ nf_ct_evict[i++] = ct;
+ continue;
+ }
+
if (!net_eq(net, nf_ct_net(ct)))
continue;
@@ -878,6 +891,13 @@ out:
if (last)
nf_ct_put(last);
+ while (i) {
+ i--;
+ if (nf_ct_should_gc(nf_ct_evict[i]))
+ nf_ct_kill(nf_ct_evict[i]);
+ nf_ct_put(nf_ct_evict[i]);
+ }
+
return skb->len;
}
@@ -1147,9 +1167,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
}
}
- if (del_timer(&ct->timeout))
- nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
-
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
nf_ct_put(ct);
return 0;
@@ -1517,11 +1535,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
{
u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
- if (!del_timer(&ct->timeout))
- return -ETIME;
+ ct->timeout = nfct_time_stamp + timeout * HZ;
- ct->timeout.expires = jiffies + timeout * HZ;
- add_timer(&ct->timeout);
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
return 0;
}
@@ -1719,9 +1736,8 @@ ctnetlink_create_conntrack(struct net *net,
if (!cda[CTA_TIMEOUT])
goto err1;
- ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
- ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+ ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
rcu_read_lock();
if (cda[CTA_HELP]) {
@@ -1968,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(cpu);
- if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
- nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
- nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+ if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
- nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
- nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 5588c7ae1ac2..f60a4755d71e 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
pr_debug("setting timeout of conntrack %p to 0\n", sibling);
sibling->proto.gre.timeout = 0;
sibling->proto.gre.stream_timeout = 0;
- if (del_timer(&sibling->timeout))
- sibling->timeout.function((unsigned long)sibling);
+ nf_ct_kill(sibling);
nf_ct_put(sibling);
return 1;
} else {
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index b65d5864b6d9..2d6ee1803415 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,6 +125,54 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+ int ret;
+
+ might_sleep();
+
+ ret = nf_ct_l3proto_try_module_get(nfproto);
+ if (ret < 0)
+ return ret;
+
+ /* we already have a reference, can't fail */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (!l3proto->net_ns_get)
+ return 0;
+
+ ret = l3proto->net_ns_get(net);
+ if (ret < 0)
+ nf_ct_l3proto_module_put(nfproto);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+
+ might_sleep();
+
+ /* same as nf_conntrack_netns_get(), reference assumed */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (WARN_ON(!l3proto))
+ return;
+
+ if (l3proto->net_ns_put)
+ l3proto->net_ns_put(net);
+
+ nf_ct_l3proto_module_put(nfproto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
struct nf_conntrack_l4proto *
nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
{
@@ -159,54 +207,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
nf_ct_l3num(i) == l4proto->l3proto;
}
-static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
- struct nf_conntrack_l3proto *l3proto)
-{
- if (l3proto->l3proto == PF_INET)
- return &net->ct.nf_ct_proto;
- else
- return NULL;
-}
-
-static int nf_ct_l3proto_register_sysctl(struct net *net,
- struct nf_conntrack_l3proto *l3proto)
-{
- int err = 0;
- struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
- /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
- if (in == NULL)
- return 0;
-
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- if (in->ctl_table != NULL) {
- err = nf_ct_register_sysctl(net,
- &in->ctl_table_header,
- l3proto->ctl_table_path,
- in->ctl_table);
- if (err < 0) {
- kfree(in->ctl_table);
- in->ctl_table = NULL;
- }
- }
-#endif
- return err;
-}
-
-static void nf_ct_l3proto_unregister_sysctl(struct net *net,
- struct nf_conntrack_l3proto *l3proto)
-{
- struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
-
- if (in == NULL)
- return;
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- if (in->ctl_table_header != NULL)
- nf_ct_unregister_sysctl(&in->ctl_table_header,
- &in->ctl_table,
- 0);
-#endif
-}
-
int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
{
int ret = 0;
@@ -238,20 +238,19 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+#ifdef CONFIG_SYSCTL
+extern unsigned int nf_conntrack_default_on;
+
int nf_ct_l3proto_pernet_register(struct net *net,
struct nf_conntrack_l3proto *proto)
{
- int ret = 0;
-
- if (proto->init_net) {
- ret = proto->init_net(net);
- if (ret < 0)
- return ret;
- }
+ if (nf_conntrack_default_on == 0)
+ return 0;
- return nf_ct_l3proto_register_sysctl(net, proto);
+ return proto->net_ns_get ? proto->net_ns_get(net) : 0;
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+#endif
void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
{
@@ -272,7 +271,15 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
void nf_ct_l3proto_pernet_unregister(struct net *net,
struct nf_conntrack_l3proto *proto)
{
- nf_ct_l3proto_unregister_sysctl(net, proto);
+ /*
+ * nf_conntrack_default_on *might* have registered hooks.
+ * ->net_ns_put must cope with more puts() than get(), i.e.
+ * if nf_conntrack_default_on was 0 at time of
+ * nf_ct_l3proto_pernet_register invocation this net_ns_put()
+ * should be a noop.
+ */
+ if (proto->net_ns_put)
+ proto->net_ns_put(net);
/* Remove all contrack entries for this protocol */
nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
@@ -312,26 +319,6 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
}
}
}
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
- if (err < 0) {
- nf_ct_kfree_compat_sysctl_table(pn);
- goto out;
- }
- err = nf_ct_register_sysctl(net,
- &pn->ctl_compat_header,
- "net/ipv4/netfilter",
- pn->ctl_compat_table);
- if (err == 0)
- goto out;
-
- nf_ct_kfree_compat_sysctl_table(pn);
- nf_ct_unregister_sysctl(&pn->ctl_table_header,
- &pn->ctl_table,
- pn->users);
- }
-out:
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
return err;
}
@@ -346,27 +333,20 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
nf_ct_unregister_sysctl(&pn->ctl_table_header,
&pn->ctl_table,
pn->users);
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
- nf_ct_unregister_sysctl(&pn->ctl_compat_header,
- &pn->ctl_compat_table,
- 0);
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
}
/* FIXME: Allow NULL functions and sub in pointers to generic for
them. --RR */
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
if (l4proto->l3proto >= PF_MAX)
return -EBUSY;
- if ((l4proto->to_nlattr && !l4proto->nlattr_size)
- || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
+ if ((l4proto->to_nlattr && !l4proto->nlattr_size) ||
+ (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
@@ -384,7 +364,8 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
+ RCU_INIT_POINTER(proto_array[i],
+ &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
@@ -412,10 +393,10 @@ out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one);
-int nf_ct_l4proto_pernet_register(struct net *net,
- struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_pernet_register_one(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
struct nf_proto_net *pn = NULL;
@@ -438,9 +419,9 @@ int nf_ct_l4proto_pernet_register(struct net *net,
out:
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto)
{
BUG_ON(l4proto->l3proto >= PF_MAX);
@@ -455,10 +436,10 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
synchronize_rcu();
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
-void nf_ct_l4proto_pernet_unregister(struct net *net,
- struct nf_conntrack_l4proto *l4proto)
+void nf_ct_l4proto_pernet_unregister_one(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
{
struct nf_proto_net *pn = NULL;
@@ -472,6 +453,66 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
/* Remove all contrack entries for this protocol */
nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
+
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ int ret = -EINVAL, ver;
+ unsigned int i;
+
+ for (i = 0; i < num_proto; i++) {
+ ret = nf_ct_l4proto_register_one(l4proto[i]);
+ if (ret < 0)
+ break;
+ }
+ if (i != num_proto) {
+ ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4;
+ pr_err("nf_conntrack_ipv%d: can't register %s%d proto.\n",
+ ver, l4proto[i]->name, ver);
+ nf_ct_l4proto_unregister(l4proto, i);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+
+int nf_ct_l4proto_pernet_register(struct net *net,
+ struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ int ret = -EINVAL;
+ unsigned int i;
+
+ for (i = 0; i < num_proto; i++) {
+ ret = nf_ct_l4proto_pernet_register_one(net, l4proto[i]);
+ if (ret < 0)
+ break;
+ }
+ if (i != num_proto) {
+ pr_err("nf_conntrack_%s%d: pernet registration failed\n",
+ l4proto[i]->name,
+ l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
+ nf_ct_l4proto_pernet_unregister(net, l4proto, i);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ while (num_proto-- != 0)
+ nf_ct_l4proto_unregister_one(l4proto[num_proto]);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+
+void nf_ct_l4proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ while (num_proto-- != 0)
+ nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]);
+}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
int nf_conntrack_proto_pernet_init(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 399a38fd685a..b68ce6ac13b3 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -9,7 +9,6 @@
*
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
@@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
},
};
-/* this module per-net specifics */
-static int dccp_net_id __read_mostly;
-struct dccp_net {
- struct nf_proto_net pn;
- int dccp_loose;
- unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-
-static inline struct dccp_net *dccp_pernet(struct net *net)
+static inline struct nf_dccp_net *dccp_pernet(struct net *net)
{
- return net_generic(net, dccp_net_id);
+ return &net->ct.nf_ct_proto.dccp;
}
static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -402,7 +393,8 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
{
struct dccp_hdr _hdr, *dh;
- dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ /* Actually only need first 4 bytes to get ports. */
+ dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
if (dh == NULL)
return false;
@@ -423,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
- struct dccp_net *dn;
+ struct nf_dccp_net *dn;
struct dccp_hdr _dh, *dh;
const char *msg;
u_int8_t state;
@@ -718,7 +710,7 @@ static int dccp_nlattr_size(void)
static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
unsigned int *timeouts = data;
int i;
@@ -819,7 +811,7 @@ static struct ctl_table dccp_sysctl_table[] = {
#endif /* CONFIG_SYSCTL */
static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
- struct dccp_net *dn)
+ struct nf_dccp_net *dn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -849,7 +841,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
static int dccp_init_net(struct net *net, u_int16_t proto)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
struct nf_proto_net *pn = &dn->pn;
if (!pn->users) {
@@ -867,7 +859,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
return dccp_kmemdup_sysctl_table(net, pn, dn);
}
-static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
.l3proto = AF_INET,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -897,11 +889,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
-static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
.l3proto = AF_INET6,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -931,78 +923,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
-
-static __net_init int dccp_net_init(struct net *net)
-{
- int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
- if (ret < 0) {
- pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
- if (ret < 0) {
- pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
- goto cleanup_dccp4;
- }
- return 0;
-cleanup_dccp4:
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
-out:
- return ret;
-}
-
-static __net_exit void dccp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto6);
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
-}
-
-static struct pernet_operations dccp_net_ops = {
- .init = dccp_net_init,
- .exit = dccp_net_exit,
- .id = &dccp_net_id,
- .size = sizeof(struct dccp_net),
-};
-
-static int __init nf_conntrack_proto_dccp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&dccp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&dccp_proto4);
- if (ret < 0)
- goto out_dccp4;
-
- ret = nf_ct_l4proto_register(&dccp_proto6);
- if (ret < 0)
- goto out_dccp6;
-
- return 0;
-out_dccp6:
- nf_ct_l4proto_unregister(&dccp_proto4);
-out_dccp4:
- unregister_pernet_subsys(&dccp_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_dccp_fini(void)
-{
- nf_ct_l4proto_unregister(&dccp_proto6);
- nf_ct_l4proto_unregister(&dccp_proto4);
- unregister_pernet_subsys(&dccp_net_ops);
-}
-
-module_init(nf_conntrack_proto_dccp_init);
-module_exit(nf_conntrack_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 86dc752e5349..d5868bad33a7 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -151,17 +151,6 @@ static struct ctl_table generic_sysctl_table[] = {
},
{ }
};
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table generic_compat_sysctl_table[] = {
- {
- .procname = "ip_conntrack_generic_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -179,40 +168,14 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
- struct nf_generic_net *gn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
- sizeof(generic_compat_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_compat_table)
- return -ENOMEM;
-
- pn->ctl_compat_table[0].data = &gn->timeout;
-#endif
-#endif
- return 0;
-}
-
static int generic_init_net(struct net *net, u_int16_t proto)
{
- int ret;
struct nf_generic_net *gn = generic_pernet(net);
struct nf_proto_net *pn = &gn->pn;
gn->timeout = nf_ct_generic_timeout;
- ret = generic_kmemdup_compat_sysctl_table(pn, gn);
- if (ret < 0)
- return ret;
-
- ret = generic_kmemdup_sysctl_table(pn, gn);
- if (ret < 0)
- nf_ct_kfree_compat_sysctl_table(pn);
-
- return ret;
+ return generic_kmemdup_sysctl_table(pn, gn);
}
static struct nf_proto_net *generic_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a96451a7af20..87bb40a3feb5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -53,7 +53,7 @@ static unsigned int gre_timeouts[GRE_CT_MAX] = {
[GRE_CT_REPLIED] = 180*HZ,
};
-static int proto_gre_net_id __read_mostly;
+static unsigned int proto_gre_net_id __read_mostly;
struct netns_proto_gre {
struct nf_proto_net nf;
rwlock_t keymap_lock;
@@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple)
{
- const struct gre_hdr_pptp *pgrehdr;
- struct gre_hdr_pptp _pgrehdr;
+ const struct pptp_gre_header *pgrehdr;
+ struct pptp_gre_header _pgrehdr;
__be16 srckey;
- const struct gre_hdr *grehdr;
- struct gre_hdr _grehdr;
+ const struct gre_base_hdr *grehdr;
+ struct gre_base_hdr _grehdr;
/* first only delinearize old RFC1701 GRE header */
grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
- if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+ if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) {
/* try to behave like "nf_conntrack_proto_generic" */
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
@@ -212,8 +212,8 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
if (!pgrehdr)
return true;
- if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
- pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+ if (grehdr->protocol != GRE_PROTO_PPP) {
+ pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol));
return false;
}
@@ -396,7 +396,9 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
static int proto_gre_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
+
+ ret = nf_ct_l4proto_pernet_register_one(net,
+ &nf_conntrack_l4proto_gre4);
if (ret < 0)
pr_err("nf_conntrack_gre4: pernet registration failed.\n");
return ret;
@@ -404,7 +406,7 @@ static int proto_gre_net_init(struct net *net)
static void proto_gre_net_exit(struct net *net)
{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
+ nf_ct_l4proto_pernet_unregister_one(net, &nf_conntrack_l4proto_gre4);
nf_ct_gre_keymap_flush(net);
}
@@ -422,8 +424,7 @@ static int __init nf_ct_proto_gre_init(void)
ret = register_pernet_subsys(&proto_gre_net_ops);
if (ret < 0)
goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+ ret = nf_ct_l4proto_register_one(&nf_conntrack_l4proto_gre4);
if (ret < 0)
goto out_gre4;
@@ -436,7 +437,7 @@ out_pernet:
static void __exit nf_ct_proto_gre_fini(void)
{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+ nf_ct_l4proto_unregister_one(&nf_conntrack_l4proto_gre4);
unregister_pernet_subsys(&proto_gre_net_ops);
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 1d7ab960a9e6..a0efde38da44 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -15,7 +15,6 @@
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
-#include <linux/module.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/sctp.h>
@@ -144,15 +143,9 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
}
};
-static int sctp_net_id __read_mostly;
-struct sctp_net {
- struct nf_proto_net pn;
- unsigned int timeouts[SCTP_CONNTRACK_MAX];
-};
-
-static inline struct sctp_net *sctp_pernet(struct net *net)
+static inline struct nf_sctp_net *sctp_pernet(struct net *net)
{
- return net_generic(net, sctp_net_id);
+ return &net->ct.nf_ct_proto.sctp;
}
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -161,8 +154,8 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
const struct sctphdr *hp;
struct sctphdr _hdr;
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ /* Actually only need first 4 bytes to get ports. */
+ hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
if (hp == NULL)
return false;
@@ -600,7 +593,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
int i;
/* set default SCTP timeouts. */
@@ -705,58 +698,10 @@ static struct ctl_table sctp_sysctl_table[] = {
},
{ }
};
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table sctp_compat_sysctl_table[] = {
- {
- .procname = "ip_conntrack_sctp_timeout_closed",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_cookie_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_established",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif
static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct sctp_net *sn)
+ struct nf_sctp_net *sn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -781,33 +726,9 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
- struct sctp_net *sn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
- sizeof(sctp_compat_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_compat_table)
- return -ENOMEM;
-
- pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
- pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
- pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
- pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
- pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
- pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
- pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
-#endif
-#endif
- return 0;
-}
-
static int sctp_init_net(struct net *net, u_int16_t proto)
{
- int ret;
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
struct nf_proto_net *pn = &sn->pn;
if (!pn->users) {
@@ -817,21 +738,10 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
sn->timeouts[i] = sctp_timeouts[i];
}
- if (proto == AF_INET) {
- ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
- if (ret < 0)
- return ret;
-
- ret = sctp_kmemdup_sysctl_table(pn, sn);
- if (ret < 0)
- nf_ct_kfree_compat_sysctl_table(pn);
- } else
- ret = sctp_kmemdup_sysctl_table(pn, sn);
-
- return ret;
+ return sctp_kmemdup_sysctl_table(pn, sn);
}
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.l3proto = PF_INET,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -861,11 +771,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.nla_policy = sctp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.l3proto = PF_INET6,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -895,81 +805,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#endif
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
-
-static int sctp_net_init(struct net *net)
-{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
- if (ret < 0) {
- pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
- if (ret < 0) {
- pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
- goto cleanup_sctp4;
- }
- return 0;
-
-cleanup_sctp4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
-out:
- return ret;
-}
-
-static void sctp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
-}
-
-static struct pernet_operations sctp_net_ops = {
- .init = sctp_net_init,
- .exit = sctp_net_exit,
- .id = &sctp_net_id,
- .size = sizeof(struct sctp_net),
-};
-
-static int __init nf_conntrack_proto_sctp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&sctp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
- if (ret < 0)
- goto out_sctp4;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
- if (ret < 0)
- goto out_sctp6;
-
- return 0;
-out_sctp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
-out_sctp4:
- unregister_pernet_subsys(&sctp_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_sctp_fini(void)
-{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
- unregister_pernet_subsys(&sctp_net_ops);
-}
-
-module_init(nf_conntrack_proto_sctp_init);
-module_exit(nf_conntrack_proto_sctp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kiran Kumar Immidi");
-MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
-MODULE_ALIAS("ip_conntrack_proto_sctp");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 70c8381641a7..69f687740c76 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -282,8 +282,8 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
const struct tcphdr *hp;
struct tcphdr _hdr;
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ /* Actually only need first 4 bytes to get ports. */
+ hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
if (hp == NULL)
return false;
@@ -1481,90 +1481,6 @@ static struct ctl_table tcp_sysctl_table[] = {
},
{ }
};
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table tcp_compat_sysctl_table[] = {
- {
- .procname = "ip_conntrack_tcp_timeout_syn_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_syn_sent2",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_syn_recv",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_established",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_fin_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_close_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_last_ack",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_time_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_close",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_timeout_max_retrans",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_tcp_loose",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_tcp_be_liberal",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ip_conntrack_tcp_max_retrans",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -1597,38 +1513,8 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
- struct nf_tcp_net *tn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
- sizeof(tcp_compat_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_compat_table)
- return -ENOMEM;
-
- pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
- pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
- pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
- pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
- pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
- pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
- pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
- pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
- pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
- pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
- pn->ctl_compat_table[10].data = &tn->tcp_loose;
- pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
- pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
-#endif
-#endif
- return 0;
-}
-
static int tcp_init_net(struct net *net, u_int16_t proto)
{
- int ret;
struct nf_tcp_net *tn = tcp_pernet(net);
struct nf_proto_net *pn = &tn->pn;
@@ -1643,18 +1529,7 @@ static int tcp_init_net(struct net *net, u_int16_t proto)
tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
}
- if (proto == AF_INET) {
- ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
- if (ret < 0)
- return ret;
-
- ret = tcp_kmemdup_sysctl_table(pn, tn);
- if (ret < 0)
- nf_ct_kfree_compat_sysctl_table(pn);
- } else
- ret = tcp_kmemdup_sysctl_table(pn, tn);
-
- return ret;
+ return tcp_kmemdup_sysctl_table(pn, tn);
}
static struct nf_proto_net *tcp_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 4fd040575ffe..20f35ed68030 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -44,8 +44,8 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb,
const struct udphdr *hp;
struct udphdr _hdr;
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ /* Actually only need first 4 bytes to get ports. */
+ hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
if (hp == NULL)
return false;
@@ -218,23 +218,6 @@ static struct ctl_table udp_sysctl_table[] = {
},
{ }
};
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table udp_compat_sysctl_table[] = {
- {
- .procname = "ip_conntrack_udp_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "ip_conntrack_udp_timeout_stream",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -254,27 +237,8 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
- struct nf_udp_net *un)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
- sizeof(udp_compat_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_compat_table)
- return -ENOMEM;
-
- pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
- pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
-#endif
-#endif
- return 0;
-}
-
static int udp_init_net(struct net *net, u_int16_t proto)
{
- int ret;
struct nf_udp_net *un = udp_pernet(net);
struct nf_proto_net *pn = &un->pn;
@@ -285,18 +249,7 @@ static int udp_init_net(struct net *net, u_int16_t proto)
un->timeouts[i] = udp_timeouts[i];
}
- if (proto == AF_INET) {
- ret = udp_kmemdup_compat_sysctl_table(pn, un);
- if (ret < 0)
- return ret;
-
- ret = udp_kmemdup_sysctl_table(pn, un);
- if (ret < 0)
- nf_ct_kfree_compat_sysctl_table(pn);
- } else
- ret = udp_kmemdup_sysctl_table(pn, un);
-
- return ret;
+ return udp_kmemdup_sysctl_table(pn, un);
}
static struct nf_proto_net *udp_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 9d692f5adb94..c35f7bf05d8c 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/timer.h>
-#include <linux/module.h>
#include <linux/udp.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
@@ -24,26 +23,14 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_log.h>
-enum udplite_conntrack {
- UDPLITE_CT_UNREPLIED,
- UDPLITE_CT_REPLIED,
- UDPLITE_CT_MAX
-};
-
static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
[UDPLITE_CT_UNREPLIED] = 30*HZ,
[UDPLITE_CT_REPLIED] = 180*HZ,
};
-static int udplite_net_id __read_mostly;
-struct udplite_net {
- struct nf_proto_net pn;
- unsigned int timeouts[UDPLITE_CT_MAX];
-};
-
-static inline struct udplite_net *udplite_pernet(struct net *net)
+static inline struct nf_udplite_net *udplite_pernet(struct net *net)
{
- return net_generic(net, udplite_net_id);
+ return &net->ct.nf_ct_proto.udplite;
}
static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
@@ -54,7 +41,8 @@ static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
const struct udphdr *hp;
struct udphdr _hdr;
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ /* Actually only need first 4 bytes to get ports. */
+ hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
if (hp == NULL)
return false;
@@ -177,7 +165,7 @@ static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct udplite_net *un = udplite_pernet(net);
+ struct nf_udplite_net *un = udplite_pernet(net);
/* set default timeouts for UDPlite. */
timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
@@ -236,7 +224,7 @@ static struct ctl_table udplite_sysctl_table[] = {
#endif /* CONFIG_SYSCTL */
static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct udplite_net *un)
+ struct nf_udplite_net *un)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -256,7 +244,7 @@ static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
static int udplite_init_net(struct net *net, u_int16_t proto)
{
- struct udplite_net *un = udplite_pernet(net);
+ struct nf_udplite_net *un = udplite_pernet(net);
struct nf_proto_net *pn = &un->pn;
if (!pn->users) {
@@ -269,7 +257,7 @@ static int udplite_init_net(struct net *net, u_int16_t proto)
return udplite_kmemdup_sysctl_table(pn, un);
}
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
@@ -298,11 +286,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.nla_policy = udplite_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
.init_net = udplite_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
{
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
@@ -331,78 +319,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.nla_policy = udplite_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
.init_net = udplite_init_net,
};
-
-static int udplite_net_init(struct net *net)
-{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
- if (ret < 0) {
- pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
- if (ret < 0) {
- pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
- goto cleanup_udplite4;
- }
- return 0;
-
-cleanup_udplite4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
-out:
- return ret;
-}
-
-static void udplite_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
-}
-
-static struct pernet_operations udplite_net_ops = {
- .init = udplite_net_init,
- .exit = udplite_net_exit,
- .id = &udplite_net_id,
- .size = sizeof(struct udplite_net),
-};
-
-static int __init nf_conntrack_proto_udplite_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&udplite_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
- if (ret < 0)
- goto out_udplite4;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
- if (ret < 0)
- goto out_udplite6;
-
- return 0;
-out_udplite6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
-out_udplite4:
- unregister_pernet_subsys(&udplite_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_udplite_exit(void)
-{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
- unregister_pernet_subsys(&udplite_net_ops);
-}
-
-module_init(nf_conntrack_proto_udplite_init);
-module_exit(nf_conntrack_proto_udplite_exit);
-
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dff0f0cc59e4..ef7063eced7c 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
s32 seqoff, ackoff;
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
struct nf_ct_seqadj *this_way, *other_way;
- int res;
+ int res = 1;
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
@@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
else
seqoff = this_way->offset_before;
+ newseq = htonl(ntohl(tcph->seq) + seqoff);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+ pr_debug("Adjusting sequence number from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq));
+ tcph->seq = newseq;
+
+ if (!tcph->ack)
+ goto out;
+
if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
other_way->correction_pos))
ackoff = other_way->offset_after;
else
ackoff = other_way->offset_before;
- newseq = htonl(ntohl(tcph->seq) + seqoff);
newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
- inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
false);
-
- pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n",
ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
ntohl(newack));
-
- tcph->seq = newseq;
tcph->ack_seq = newack;
res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+out:
spin_unlock_bh(&ct->lock);
return res;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 7d77217de6a3..c3fc14e021ec 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
static int iswordc(const char c)
{
if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
- (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' ||
c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
- c == '{' || c == '}' || c == '~')
+ c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') ||
+ c == '\'')
return 1;
return 0;
}
@@ -329,13 +330,12 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit)
static const char *sip_skip_whitespace(const char *dptr, const char *limit)
{
for (; dptr < limit; dptr++) {
- if (*dptr == ' ')
+ if (*dptr == ' ' || *dptr == '\t')
continue;
if (*dptr != '\r' && *dptr != '\n')
break;
dptr = sip_follow_continuation(dptr, limit);
- if (dptr == NULL)
- return NULL;
+ break;
}
return dptr;
}
@@ -1436,9 +1436,12 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
handler = &sip_handlers[i];
if (handler->request == NULL)
continue;
- if (*datalen < handler->len ||
+ if (*datalen < handler->len + 2 ||
strncasecmp(*dptr, handler->method, handler->len))
continue;
+ if ((*dptr)[handler->len] != ' ' ||
+ !isalpha((*dptr)[handler->len+1]))
+ continue;
if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
&matchoff, &matchlen) <= 0) {
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9f267c3ffb39..d009ae663453 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
return 0;
+ if (nf_ct_should_gc(ct)) {
+ nf_ct_kill(ct);
+ goto release;
+ }
+
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
goto release;
@@ -228,8 +233,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
seq_printf(s, "%-8s %u %-8s %u %ld ",
l3proto->name, nf_ct_l3num(ct),
l4proto->name, nf_ct_protonum(ct),
- timer_pending(&ct->timeout)
- ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+ nf_ct_expires(ct) / HZ);
if (l4proto->print_conntrack)
l4proto->print_conntrack(s, ct);
@@ -353,13 +357,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- st->searched,
+ 0,
st->found,
- st->new,
+ 0,
st->invalid,
st->ignore,
- st->delete,
- st->delete_list,
+ 0,
+ 0,
st->insert,
st->insert_failed,
st->drop,
@@ -448,6 +452,9 @@ static int log_invalid_proto_max __read_mostly = 255;
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
+extern unsigned int nf_conntrack_default_on;
+unsigned int nf_conntrack_default_on __read_mostly = 1;
+
static int
nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +520,13 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "nf_conntrack_default_on",
+ .data = &nf_conntrack_default_on,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 7ec69723940f..c9d7f95768ab 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -14,24 +14,41 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+{
+ if (skb_mac_header_was_set(skb))
+ skb_push(skb, skb->mac_len);
+
+ skb->dev = dev;
+ dev_queue_xmit(skb);
+}
+
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+ if (!dev) {
+ kfree_skb(pkt->skb);
+ return;
+ }
+
+ nf_do_netdev_egress(pkt->skb, dev);
+}
+EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
+
void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
{
struct net_device *dev;
struct sk_buff *skb;
- dev = dev_get_by_index_rcu(pkt->net, oif);
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
if (dev == NULL)
return;
skb = skb_clone(pkt->skb, GFP_ATOMIC);
- if (skb == NULL)
- return;
-
- if (skb_mac_header_was_set(skb))
- skb_push(skb, skb->mac_len);
-
- skb->dev = dev;
- dev_queue_xmit(skb);
+ if (skb)
+ nf_do_netdev_egress(skb, dev);
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 065522564ac6..c46d214d5323 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -11,15 +11,10 @@
#define NFDEBUG(format, args...)
#endif
-
-/* core.c */
-unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
- struct nf_hook_state *state, struct nf_hook_ops **elemp);
-
/* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
- struct nf_hook_state *state, unsigned int queuenum);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+ struct nf_hook_entry **entryp, unsigned int verdict);
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry);
int __init netfilter_queue_init(void);
/* nf_log.c */
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index aa5847a16713..ffb9e8ada899 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -13,7 +13,6 @@
/* Internal logging interface, which relies on the real
LOG target modules */
-#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
@@ -39,12 +38,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
return NULL;
}
-void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
{
const struct nf_logger *log;
- if (pf == NFPROTO_UNSPEC)
- return;
+ if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return -EOPNOTSUPP;
mutex_lock(&nf_log_mutex);
log = nft_log_dereference(net->nf.nf_loggers[pf]);
@@ -52,6 +51,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
mutex_unlock(&nf_log_mutex);
+
+ return 0;
}
EXPORT_SYMBOL(nf_log_set);
@@ -420,7 +421,7 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
char buf[NFLOGGER_NAME_LEN];
int r = 0;
int tindex = (unsigned long)table->extra1;
- struct net *net = current->nsproxy->net_ns;
+ struct net *net = table->extra2;
if (write) {
struct ctl_table tmp = *table;
@@ -474,7 +475,6 @@ static int netfilter_log_sysctl_init(struct net *net)
3, "%d", i);
nf_log_sysctl_table[i].procname =
nf_log_sysctl_fnames[i];
- nf_log_sysctl_table[i].data = NULL;
nf_log_sysctl_table[i].maxlen = NFLOGGER_NAME_LEN;
nf_log_sysctl_table[i].mode = 0644;
nf_log_sysctl_table[i].proc_handler =
@@ -484,6 +484,9 @@ static int netfilter_log_sysctl_init(struct net *net)
}
}
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+ table[i].extra2 = net;
+
net->nf.nf_log_dir_header = register_net_sysctl(net,
"net/netfilter/nf_log",
table);
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a5aa5967b8e1..dc61399e30be 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
nf_log_buf_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & XT_LOG_TCPSEQ) {
+ if (logflags & NF_LOG_TCPSEQ) {
nf_log_buf_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
}
@@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
/* Max length: 11 "URGP=65535 " */
nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
- if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+ if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
u_int8_t _opt[60 - sizeof(struct tcphdr)];
const u_int8_t *op;
unsigned int i;
@@ -175,6 +175,34 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
}
EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+/* bridge and netdev logging families share this code. */
+void nf_log_l2packet(struct net *net, u_int8_t pf,
+ __be16 protocol,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ switch (protocol) {
+ case htons(ETH_P_IP):
+ nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_log_l2packet);
+
static int __init nf_log_common_init(void)
{
return 0;
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
new file mode 100644
index 000000000000..350eb147754d
--- /dev/null
+++ b/net/netfilter/nf_log_netdev.c
@@ -0,0 +1,81 @@
+/*
+ * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_log.h>
+
+static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
+ loginfo, prefix);
+}
+
+static struct nf_logger nf_netdev_logger __read_mostly = {
+ .name = "nf_log_netdev",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_netdev_net_init(struct net *net)
+{
+ return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
+}
+
+static void __net_exit nf_log_netdev_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_netdev_logger);
+}
+
+static struct pernet_operations nf_log_netdev_net_ops = {
+ .init = nf_log_netdev_net_init,
+ .exit = nf_log_netdev_net_exit,
+};
+
+static int __init nf_log_netdev_init(void)
+{
+ int ret;
+
+ /* Request to load the real packet loggers. */
+ nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
+
+ ret = register_pernet_subsys(&nf_log_netdev_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
+ return 0;
+}
+
+static void __exit nf_log_netdev_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_netdev_net_ops);
+ nf_log_unregister(&nf_netdev_logger);
+}
+
+module_init(nf_log_netdev_init);
+module_exit(nf_log_netdev_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter netdev packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index ecee105bbada..94b14c5a8b17 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -42,7 +42,7 @@ struct nf_nat_conn_key {
const struct nf_conntrack_zone *zone;
};
-static struct rhashtable nf_nat_bysource_table;
+static struct rhltable nf_nat_bysource_table;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -193,9 +193,12 @@ static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
const struct nf_nat_conn_key *key = arg->key;
const struct nf_conn *ct = obj;
- return same_src(ct, key->tuple) &&
- net_eq(nf_ct_net(ct), key->net) &&
- nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL);
+ if (!same_src(ct, key->tuple) ||
+ !net_eq(nf_ct_net(ct), key->net) ||
+ !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
+ return 1;
+
+ return 0;
}
static struct rhashtable_params nf_nat_bysource_params = {
@@ -204,7 +207,6 @@ static struct rhashtable_params nf_nat_bysource_params = {
.obj_cmpfn = nf_nat_bysource_cmp,
.nelem_hint = 256,
.min_size = 1024,
- .nulls_base = (1U << RHT_BASE_SHIFT),
};
/* Only called for SRC manip */
@@ -223,12 +225,15 @@ find_appropriate_src(struct net *net,
.tuple = tuple,
.zone = zone
};
+ struct rhlist_head *hl;
- ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key,
- nf_nat_bysource_params);
- if (!ct)
+ hl = rhltable_lookup(&nf_nat_bysource_table, &key,
+ nf_nat_bysource_params);
+ if (!hl)
return 0;
+ ct = container_of(hl, typeof(*ct), nat_bysource);
+
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
@@ -446,11 +451,17 @@ nf_nat_setup_info(struct nf_conn *ct,
}
if (maniptype == NF_NAT_MANIP_SRC) {
+ struct nf_nat_conn_key key = {
+ .net = nf_ct_net(ct),
+ .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ .zone = nf_ct_zone(ct),
+ };
int err;
- err = rhashtable_insert_fast(&nf_nat_bysource_table,
- &ct->nat_bysource,
- nf_nat_bysource_params);
+ err = rhltable_insert_key(&nf_nat_bysource_table,
+ &key,
+ &ct->nat_bysource,
+ nf_nat_bysource_params);
if (err)
return NF_DROP;
}
@@ -566,15 +577,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
* will delete entry from already-freed table.
*/
- if (!del_timer(&ct->timeout))
- return 1;
-
ct->status &= ~IPS_NAT_DONE_MASK;
-
- rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
-
- add_timer(&ct->timeout);
+ rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -677,6 +682,18 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
&nf_nat_l4proto_tcp);
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
&nf_nat_l4proto_udp);
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
+ &nf_nat_l4proto_dccp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
+ &nf_nat_l4proto_sctp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
+ &nf_nat_l4proto_udplite);
+#endif
mutex_unlock(&nf_nat_proto_mutex);
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
@@ -704,8 +721,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (!nat)
return;
- rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -840,13 +857,13 @@ static int __init nf_nat_init(void)
{
int ret;
- ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
+ ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
if (ret)
return ret;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -870,7 +887,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -889,7 +906,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 15c47b246d0d..269fcd5dc34c 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -10,8 +10,6 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/dccp.h>
@@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb,
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.l4proto = IPPROTO_DCCP,
.manip_pkt = dccp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_dccp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP NAT protocol helper");
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index cbc7ade1487b..31d358691af0 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -7,9 +7,7 @@
*/
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/sctp.h>
-#include <linux/module.h>
#include <net/sctp/checksum.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -49,12 +47,15 @@ sctp_manip_pkt(struct sk_buff *skb,
hdr->dest = tuple->dst.u.sctp.port;
}
- hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ skb->ip_summed = CHECKSUM_NONE;
+ }
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.l4proto = IPPROTO_SCTP,
.manip_pkt = sctp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -63,34 +64,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_sctp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
index 58340c97bd83..366bfbfd82a1 100644
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -8,11 +8,9 @@
*/
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/udp.h>
#include <linux/netfilter.h>
-#include <linux/module.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -64,7 +62,7 @@ udplite_manip_pkt(struct sk_buff *skb,
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.l4proto = IPPROTO_UDPLITE,
.manip_pkt = udplite_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -73,34 +71,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_udplite_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-}
-
-module_init(nf_nat_proto_udplite_init);
-module_exit(nf_nat_proto_udplite_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b19ad20a705c..4a7662486f44 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,25 +96,19 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
{
const struct nf_queue_handler *qh;
rcu_read_lock();
qh = rcu_dereference(net->nf.queue_handler);
if (qh)
- qh->nf_hook_drop(net, ops);
+ qh->nf_hook_drop(net, entry);
rcu_read_unlock();
}
-/*
- * Any packet that leaves via this function must come back
- * through nf_reinject().
- */
-int nf_queue(struct sk_buff *skb,
- struct nf_hook_ops *elem,
- struct nf_hook_state *state,
- unsigned int queuenum)
+static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
+ struct nf_hook_entry *hook_entry, unsigned int queuenum)
{
int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
@@ -141,8 +135,8 @@ int nf_queue(struct sk_buff *skb,
*entry = (struct nf_queue_entry) {
.skb = skb,
- .elem = elem,
.state = *state,
+ .hook = hook_entry,
.size = sizeof(*entry) + afinfo->route_key_size,
};
@@ -163,10 +157,50 @@ err:
return status;
}
+/* Packets leaving via this function must come back through nf_reinject(). */
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+ struct nf_hook_entry **entryp, unsigned int verdict)
+{
+ struct nf_hook_entry *entry = *entryp;
+ int ret;
+
+ ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS);
+ if (ret < 0) {
+ if (ret == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) {
+ *entryp = rcu_dereference(entry->next);
+ return 1;
+ }
+ kfree_skb(skb);
+ }
+
+ return 0;
+}
+
+static unsigned int nf_iterate(struct sk_buff *skb,
+ struct nf_hook_state *state,
+ struct nf_hook_entry **entryp)
+{
+ unsigned int verdict;
+
+ do {
+repeat:
+ verdict = nf_hook_entry_hookfn((*entryp), skb, state);
+ if (verdict != NF_ACCEPT) {
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ *entryp = rcu_dereference((*entryp)->next);
+ } while (*entryp);
+
+ return NF_ACCEPT;
+}
+
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
+ struct nf_hook_entry *hook_entry = entry->hook;
struct sk_buff *skb = entry->skb;
- struct nf_hook_ops *elem = entry->elem;
const struct nf_afinfo *afinfo;
int err;
@@ -174,7 +208,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT)
- verdict = elem->hook(elem->priv, skb, &entry->state);
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
if (verdict == NF_ACCEPT) {
afinfo = nf_get_afinfo(entry->state.pf);
@@ -182,29 +216,27 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
verdict = NF_DROP;
}
- entry->state.thresh = INT_MIN;
-
if (verdict == NF_ACCEPT) {
- next_hook:
- verdict = nf_iterate(entry->state.hook_list,
- skb, &entry->state, &elem);
+ hook_entry = rcu_dereference(hook_entry->next);
+ if (hook_entry)
+next_hook:
+ verdict = nf_iterate(skb, &entry->state, &hook_entry);
}
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
case NF_STOP:
+okfn:
local_bh_disable();
entry->state.okfn(entry->state.net, entry->state.sk, skb);
local_bh_enable();
break;
case NF_QUEUE:
- err = nf_queue(skb, elem, &entry->state,
- verdict >> NF_VERDICT_QBITS);
- if (err < 0) {
- if (err == -ESRCH &&
- (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ err = nf_queue(skb, &entry->state, &hook_entry, verdict);
+ if (err == 1) {
+ if (hook_entry)
goto next_hook;
- kfree_skb(skb);
+ goto okfn;
}
break;
case NF_STOLEN:
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index c8a4a48bced9..7c6d1fbe38b9 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -24,7 +24,7 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
-int synproxy_net_id;
+unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
bool
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7e1c876c7608..1b913760f205 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,7 @@
#include <net/sock.h>
static LIST_HEAD(nf_tables_expressions);
+static LIST_HEAD(nf_tables_objects);
/**
* nft_register_afinfo - register nf_tables address family info
@@ -110,12 +111,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->seq = nlh->nlmsg_seq;
}
-static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
- u32 size)
+static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
+ int msg_type, u32 size, gfp_t gfp)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
if (trans == NULL)
return NULL;
@@ -125,6 +126,12 @@ static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
return trans;
}
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
+{
+ return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+}
+
static void nft_trans_destroy(struct nft_trans *trans)
{
list_del(&trans->list);
@@ -304,6 +311,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
return err;
}
+static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_object *obj)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWOBJ)
+ nft_activate_next(ctx->net, obj);
+
+ nft_trans_obj(trans) = obj;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
+{
+ int err;
+
+ err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);
+ if (err < 0)
+ return err;
+
+ nft_deactivate_next(ctx->net, obj);
+ ctx->table->use--;
+
+ return err;
+}
+
/*
* Tables
*/
@@ -688,6 +727,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
+ INIT_LIST_HEAD(&table->objects);
table->flags = flags;
nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -709,6 +749,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
{
int err;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_set *set, *ns;
list_for_each_entry(chain, &ctx->table->chains, list) {
@@ -735,6 +776,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
goto out;
}
+ list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+ err = nft_delobj(ctx, obj);
+ if (err < 0)
+ goto out;
+ }
+
list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
if (!nft_is_active_next(ctx->net, chain))
continue;
@@ -881,7 +928,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
}
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
- [NFTA_CHAIN_TABLE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
[NFTA_CHAIN_NAME] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
@@ -1196,6 +1244,83 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
}
}
+struct nft_chain_hook {
+ u32 num;
+ u32 priority;
+ const struct nf_chain_type *type;
+ struct net_device *dev;
+};
+
+static int nft_chain_parse_hook(struct net *net,
+ const struct nlattr * const nla[],
+ struct nft_af_info *afi,
+ struct nft_chain_hook *hook, bool create)
+{
+ struct nlattr *ha[NFTA_HOOK_MAX + 1];
+ const struct nf_chain_type *type;
+ struct net_device *dev;
+ int err;
+
+ err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+ nft_hook_policy);
+ if (err < 0)
+ return err;
+
+ if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+ ha[NFTA_HOOK_PRIORITY] == NULL)
+ return -EINVAL;
+
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hook->num >= afi->nhooks)
+ return -EINVAL;
+
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+
+ type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE],
+ create);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+ }
+ if (!(type->hook_mask & (1 << hook->num)))
+ return -EOPNOTSUPP;
+ if (!try_module_get(type->owner))
+ return -ENOENT;
+
+ hook->type = type;
+
+ hook->dev = NULL;
+ if (afi->flags & NFT_AF_NEEDS_DEV) {
+ char ifname[IFNAMSIZ];
+
+ if (!ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
+ nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
+ dev = dev_get_by_name(net, ifname);
+ if (!dev) {
+ module_put(type->owner);
+ return -ENOENT;
+ }
+ hook->dev = dev;
+ } else if (ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void nft_chain_release_hook(struct nft_chain_hook *hook)
+{
+ module_put(hook->type->owner);
+ if (hook->dev != NULL)
+ dev_put(hook->dev);
+}
+
static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -1206,10 +1331,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct nft_table *table;
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
- struct nlattr *ha[NFTA_HOOK_MAX + 1];
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
- struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
u64 handle = 0;
unsigned int i;
@@ -1273,6 +1396,37 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+ if (nla[NFTA_CHAIN_HOOK]) {
+ struct nft_base_chain *basechain;
+ struct nft_chain_hook hook;
+ struct nf_hook_ops *ops;
+
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ return -EBUSY;
+
+ err = nft_chain_parse_hook(net, nla, afi, &hook,
+ create);
+ if (err < 0)
+ return err;
+
+ basechain = nft_base_chain(chain);
+ if (basechain->type != hook.type) {
+ nft_chain_release_hook(&hook);
+ return -EBUSY;
+ }
+
+ for (i = 0; i < afi->nops; i++) {
+ ops = &basechain->ops[i];
+ if (ops->hooknum != hook.num ||
+ ops->priority != hook.priority ||
+ ops->dev != hook.dev) {
+ nft_chain_release_hook(&hook);
+ return -EBUSY;
+ }
+ }
+ nft_chain_release_hook(&hook);
+ }
+
if (nla[NFTA_CHAIN_HANDLE] && name) {
struct nft_chain *chain2;
@@ -1320,102 +1474,53 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
return -EOVERFLOW;
if (nla[NFTA_CHAIN_HOOK]) {
- const struct nf_chain_type *type;
+ struct nft_chain_hook hook;
struct nf_hook_ops *ops;
nf_hookfn *hookfn;
- u32 hooknum, priority;
-
- type = chain_type[family][NFT_CHAIN_T_DEFAULT];
- if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(afi,
- nla[NFTA_CHAIN_TYPE],
- create);
- if (IS_ERR(type))
- return PTR_ERR(type);
- }
- err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
- nft_hook_policy);
+ err = nft_chain_parse_hook(net, nla, afi, &hook, create);
if (err < 0)
return err;
- if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
- ha[NFTA_HOOK_PRIORITY] == NULL)
- return -EINVAL;
-
- hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
- if (hooknum >= afi->nhooks)
- return -EINVAL;
- priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
-
- if (!(type->hook_mask & (1 << hooknum)))
- return -EOPNOTSUPP;
- if (!try_module_get(type->owner))
- return -ENOENT;
- hookfn = type->hooks[hooknum];
-
- if (afi->flags & NFT_AF_NEEDS_DEV) {
- char ifname[IFNAMSIZ];
-
- if (!ha[NFTA_HOOK_DEV]) {
- module_put(type->owner);
- return -EOPNOTSUPP;
- }
-
- nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
- dev = dev_get_by_name(net, ifname);
- if (!dev) {
- module_put(type->owner);
- return -ENOENT;
- }
- } else if (ha[NFTA_HOOK_DEV]) {
- module_put(type->owner);
- return -EOPNOTSUPP;
- }
basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
if (basechain == NULL) {
- module_put(type->owner);
- if (dev != NULL)
- dev_put(dev);
+ nft_chain_release_hook(&hook);
return -ENOMEM;
}
- if (dev != NULL)
- strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+ if (hook.dev != NULL)
+ strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
if (IS_ERR(stats)) {
- module_put(type->owner);
+ nft_chain_release_hook(&hook);
kfree(basechain);
- if (dev != NULL)
- dev_put(dev);
return PTR_ERR(stats);
}
basechain->stats = stats;
} else {
stats = netdev_alloc_pcpu_stats(struct nft_stats);
if (stats == NULL) {
- module_put(type->owner);
+ nft_chain_release_hook(&hook);
kfree(basechain);
- if (dev != NULL)
- dev_put(dev);
return -ENOMEM;
}
rcu_assign_pointer(basechain->stats, stats);
}
- basechain->type = type;
+ hookfn = hook.type->hooks[hook.num];
+ basechain->type = hook.type;
chain = &basechain->chain;
for (i = 0; i < afi->nops; i++) {
ops = &basechain->ops[i];
ops->pf = family;
- ops->hooknum = hooknum;
- ops->priority = priority;
+ ops->hooknum = hook.num;
+ ops->priority = hook.priority;
ops->priv = chain;
ops->hook = afi->hooks[ops->hooknum];
- ops->dev = dev;
+ ops->dev = hook.dev;
if (hookfn)
ops->hook = hookfn;
if (afi->hook_ops_init)
@@ -1750,7 +1855,8 @@ static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
- [NFTA_RULE_TABLE] = { .type = NLA_STRING },
+ [NFTA_RULE_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
@@ -2011,7 +2117,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (expr != nft_expr_last(rule) && expr->ops) {
nf_tables_expr_destroy(ctx, expr);
expr = nft_expr_next(expr);
}
@@ -2339,7 +2445,8 @@ nft_select_set_ops(const struct nlattr * const nla[],
}
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
- [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
@@ -2354,6 +2461,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
[NFTA_SET_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
+ [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2405,6 +2513,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
const struct nlattr *nla,
@@ -2423,6 +2532,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
const char *name)
@@ -2511,9 +2621,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
goto nla_put_failure;
}
+ if (set->flags & NFT_SET_OBJECT &&
+ nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
+ goto nla_put_failure;
if (set->timeout &&
- nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout),
+ nla_put_be64(skb, NFTA_SET_TIMEOUT,
+ cpu_to_be64(jiffies_to_msecs(set->timeout)),
NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
@@ -2739,7 +2853,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
unsigned int size;
bool create;
u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int;
+ u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
unsigned char *udata;
u16 udlen;
@@ -2769,11 +2883,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
- NFT_SET_MAP | NFT_SET_EVAL))
+ NFT_SET_MAP | NFT_SET_EVAL |
+ NFT_SET_OBJECT))
return -EINVAL;
- /* Only one of both operations is supported */
- if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) ==
- (NFT_SET_MAP | NFT_SET_EVAL))
+ /* Only one of these operations is supported */
+ if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+ (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
}
@@ -2798,11 +2913,25 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
} else if (flags & NFT_SET_MAP)
return -EINVAL;
+ if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
+ if (!(flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+ if (objtype == NFT_OBJECT_UNSPEC ||
+ objtype > NFT_OBJECT_MAX)
+ return -EINVAL;
+ } else if (flags & NFT_SET_OBJECT)
+ return -EINVAL;
+ else
+ objtype = NFT_OBJECT_UNSPEC;
+
timeout = 0;
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT]));
+ timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+ nla[NFTA_SET_TIMEOUT])));
}
gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -2884,6 +3013,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->ktype = ktype;
set->klen = desc.klen;
set->dtype = dtype;
+ set->objtype = objtype;
set->dlen = desc.dlen;
set->flags = flags;
set->size = desc.size;
@@ -2899,12 +3029,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err2;
+ goto err3;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
+err3:
+ ops->destroy(set);
err2:
kfree(set);
err1:
@@ -2955,9 +3087,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
}
static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
enum nft_registers dreg;
@@ -3003,6 +3135,7 @@ bind:
list_add_tail_rcu(&binding->list, &set->bindings);
return 0;
}
+EXPORT_SYMBOL_GPL(nf_tables_bind_set);
void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
@@ -3013,6 +3146,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
nft_is_active(ctx->net, set))
nf_tables_set_destroy(ctx, set);
}
+EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_KEY] = {
@@ -3024,6 +3158,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_EXPR] = {
.align = __alignof__(struct nft_expr),
},
+ [NFT_SET_EXT_OBJREF] = {
+ .len = sizeof(struct nft_object *),
+ .align = __alignof__(struct nft_object *),
+ },
[NFT_SET_EXT_FLAGS] = {
.len = sizeof(u8),
.align = __alignof__(u8),
@@ -3057,8 +3195,10 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
- [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
- [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
@@ -3112,6 +3252,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
+ (*nft_set_ext_obj(ext))->name) < 0)
+ goto nla_put_failure;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
htonl(*nft_set_ext_flags(ext))))
@@ -3119,7 +3264,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(*nft_set_ext_timeout(ext)),
+ cpu_to_be64(jiffies_to_msecs(
+ *nft_set_ext_timeout(ext))),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
@@ -3162,9 +3308,9 @@ struct nft_set_dump_args {
};
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
struct nft_set_dump_args *args;
@@ -3176,7 +3322,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
u8 genmask = nft_genmask_cur(net);
- const struct nft_set *set;
+ struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
@@ -3388,23 +3534,25 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_data(ext), data, set->dlen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
*nft_set_ext_expiration(ext) =
- jiffies + msecs_to_jiffies(timeout);
+ jiffies + timeout;
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
return elem;
}
-void nft_set_elem_destroy(const struct nft_set *set, void *elem)
+void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+ bool destroy_expr)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_uninit(nft_set_ext_data(ext), set->dtype);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
-
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use--;
kfree(elem);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
@@ -3426,14 +3574,16 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
}
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr)
+ const struct nlattr *attr, u32 nlmsg_flags)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ u8 genmask = nft_genmask_next(ctx->net);
struct nft_data_desc d1, d2;
struct nft_set_ext_tmpl tmpl;
- struct nft_set_ext *ext;
+ struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_object *obj = NULL;
struct nft_userdata *udata;
struct nft_data data;
enum nft_registers dreg;
@@ -3475,7 +3625,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT]));
+ timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+ nla[NFTA_SET_ELEM_TIMEOUT])));
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}
@@ -3495,6 +3646,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
+ if (!(set->flags & NFT_SET_OBJECT)) {
+ err = -EINVAL;
+ goto err2;
+ }
+ obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ set->objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err2;
+ }
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ }
+
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
err = nft_data_init(ctx, &data, sizeof(data), &d2,
nla[NFTA_SET_ELEM_DATA]);
@@ -3508,6 +3673,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
dreg = nft_type_to_reg(set->dtype);
list_for_each_entry(binding, &set->bindings, list) {
struct nft_ctx bind_ctx = {
+ .net = ctx->net,
.afi = ctx->afi,
.table = ctx->table,
.chain = (struct nft_chain *)binding->chain,
@@ -3552,20 +3718,45 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
+ if (obj) {
+ *nft_set_ext_obj(ext) = obj;
+ obj->use++;
+ }
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
goto err4;
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
- err = set->ops->insert(ctx->net, set, &elem);
- if (err < 0)
+ err = set->ops->insert(ctx->net, set, &elem, &ext2);
+ if (err) {
+ if (err == -EEXIST) {
+ if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
+ memcmp(nft_set_ext_data(ext),
+ nft_set_ext_data(ext2), set->dlen) != 0) ||
+ (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
+ *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
+ err = -EBUSY;
+ else if (!(nlmsg_flags & NLM_F_EXCL))
+ err = 0;
+ }
goto err5;
+ }
+
+ if (set->size &&
+ !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
+ err = -ENFILE;
+ goto err6;
+ }
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
+err6:
+ set->ops->remove(set, &elem);
err5:
kfree(trans);
err4:
@@ -3612,15 +3803,9 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
return -EBUSY;
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- if (set->size &&
- !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
- return -ENFILE;
-
- err = nft_add_set_elem(&ctx, set, attr);
- if (err < 0) {
- atomic_dec(&set->nelems);
+ err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
+ if (err < 0)
break;
- }
}
return err;
}
@@ -3704,6 +3889,35 @@ err1:
return err;
}
+static int nft_flush_set(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_set_elem *elem)
+{
+ struct nft_trans *trans;
+ int err;
+
+ trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
+ sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!trans)
+ return -ENOMEM;
+
+ if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) {
+ err = -ENOENT;
+ goto err1;
+ }
+ set->ndeact++;
+
+ nft_trans_elem_set(trans) = set;
+ nft_trans_elem(trans) = *elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+err1:
+ kfree(trans);
+ return err;
+}
+
static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -3714,9 +3928,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
- return -EINVAL;
-
err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
@@ -3728,6 +3939,18 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;
+ if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
+ struct nft_set_dump_args args = {
+ .iter = {
+ .genmask = genmask,
+ .fn = nft_flush_set,
+ },
+ };
+ set->ops->walk(&ctx, set, &args.iter);
+
+ return args.iter.err;
+ }
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
if (err < 0)
@@ -3745,7 +3968,7 @@ void nft_set_gc_batch_release(struct rcu_head *rcu)
gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i]);
+ nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
kfree(gcb);
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
@@ -3763,6 +3986,503 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
+/*
+ * Stateful objects
+ */
+
+/**
+ * nft_register_obj- register nf_tables stateful object type
+ * @obj: object type
+ *
+ * Registers the object type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_obj(struct nft_object_type *obj_type)
+{
+ if (obj_type->type == NFT_OBJECT_UNSPEC)
+ return -EINVAL;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_rcu(&obj_type->list, &nf_tables_objects);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_obj);
+
+/**
+ * nft_unregister_obj - unregister nf_tables object type
+ * @obj: object type
+ *
+ * Unregisters the object type for use with nf_tables.
+ */
+void nft_unregister_obj(struct nft_object_type *obj_type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&obj_type->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_obj);
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
+{
+ struct nft_object *obj;
+
+ list_for_each_entry(obj, &table->objects, list) {
+ if (!nla_strcmp(nla, obj->name) &&
+ objtype == obj->type->type &&
+ nft_active_genmask(obj, genmask))
+ return obj;
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+
+static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
+ [NFTA_OBJ_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
+ [NFTA_OBJ_NAME] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_OBJ_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJ_DATA] = { .type = NLA_NESTED },
+};
+
+static struct nft_object *nft_obj_init(const struct nft_object_type *type,
+ const struct nlattr *attr)
+{
+ struct nlattr *tb[type->maxattr + 1];
+ struct nft_object *obj;
+ int err;
+
+ if (attr) {
+ err = nla_parse_nested(tb, type->maxattr, attr, type->policy);
+ if (err < 0)
+ goto err1;
+ } else {
+ memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
+ }
+
+ err = -ENOMEM;
+ obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL);
+ if (obj == NULL)
+ goto err1;
+
+ err = type->init((const struct nlattr * const *)tb, obj);
+ if (err < 0)
+ goto err2;
+
+ obj->type = type;
+ return obj;
+err2:
+ kfree(obj);
+err1:
+ return ERR_PTR(err);
+}
+
+static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
+ struct nft_object *obj, bool reset)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ goto nla_put_failure;
+ if (obj->type->dump(skb, obj, reset) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ list_for_each_entry(type, &nf_tables_objects, list) {
+ if (objtype == type->type)
+ return type;
+ }
+ return NULL;
+}
+
+static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ type = __nft_obj_type_get(objtype);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+#ifdef CONFIG_MODULES
+ if (type == NULL) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-obj-%u", objtype);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_obj_type_get(objtype))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_object_type *type;
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+ int err;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_DATA])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ if (err != -ENOENT)
+ return err;
+
+ obj = NULL;
+ }
+
+ if (obj != NULL) {
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ return 0;
+ }
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ type = nft_obj_type_get(objtype);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err1;
+ }
+ obj->table = table;
+ nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
+
+ err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&obj->list, &table->objects);
+ table->use++;
+ return 0;
+err2:
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+ kfree(obj);
+err1:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event, u32 flags,
+ int family, const struct nft_table *table,
+ struct nft_object *obj, bool reset)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) ||
+ nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+struct nft_obj_filter {
+ char table[NFT_OBJ_MAXNAMELEN];
+ u32 type;
+};
+
+static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct nft_obj_filter *filter = cb->data;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_object *obj;
+ bool reset = false;
+
+ if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(obj, &table->objects, list) {
+ if (!nft_is_active(net, obj))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (filter && filter->table[0] &&
+ strcmp(filter->table, table->name))
+ goto cont;
+ if (filter &&
+ filter->type != NFT_OBJECT_UNSPEC &&
+ obj->type->type != filter->type)
+ goto cont;
+
+ if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ afi->family, table, obj, reset) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+
+ return 0;
+}
+
+static struct nft_obj_filter *
+nft_obj_filter_alloc(const struct nlattr * const nla[])
+{
+ struct nft_obj_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
+
+ if (nla[NFTA_OBJ_TABLE])
+ nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
+ NFT_TABLE_MAXNAMELEN);
+ if (nla[NFTA_OBJ_TYPE])
+ filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
+ return filter;
+}
+
+static int nf_tables_getobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
+ int family = nfmsg->nfgen_family;
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct nft_object *obj;
+ struct sk_buff *skb2;
+ bool reset = false;
+ u32 objtype;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ };
+
+ if (nla[NFTA_OBJ_TABLE] ||
+ nla[NFTA_OBJ_TYPE]) {
+ struct nft_obj_filter *filter;
+
+ filter = nft_obj_filter_alloc(nla);
+ if (IS_ERR(filter))
+ return -ENOMEM;
+
+ c.data = filter;
+ }
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ if (!nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_TYPE])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ family, table, obj, reset);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+ kfree_skb(skb2);
+ return err;
+
+ return 0;
+}
+
+static void nft_obj_destroy(struct nft_object *obj)
+{
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+
+ module_put(obj->type->owner);
+ kfree(obj);
+}
+
+static int nf_tables_delobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ if (obj->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ return nft_delobj(&ctx, obj);
+}
+
+int nft_obj_notify(struct net *net, struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ int family, int report, gfp_t gfp)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!report &&
+ !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
+ table, obj, false);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_obj_notify);
+
+static int nf_tables_obj_notify(const struct nft_ctx *ctx,
+ struct nft_object *obj, int event)
+{
+ return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+ ctx->seq, event, ctx->afi->family, ctx->report,
+ GFP_KERNEL);
+}
+
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
@@ -3923,6 +4643,26 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_GETGEN] = {
.call = nf_tables_getgen,
},
+ [NFT_MSG_NEWOBJ] = {
+ .call_batch = nf_tables_newobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_DELOBJ] = {
+ .call_batch = nf_tables_delobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ_RESET] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
};
static void nft_chain_commit_update(struct nft_trans *trans)
@@ -3963,7 +4703,10 @@ static void nf_tables_commit_release(struct nft_trans *trans)
break;
case NFT_MSG_DELSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv);
+ nft_trans_elem(trans).priv, true);
+ break;
+ case NFT_MSG_DELOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
break;
}
kfree(trans);
@@ -4072,6 +4815,17 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
atomic_dec(&te->set->nelems);
te->set->ndeact--;
break;
+ case NFT_MSG_NEWOBJ:
+ nft_clear(net, nft_trans_obj(trans));
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELOBJ:
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_DELOBJ);
+ break;
}
}
@@ -4104,7 +4858,10 @@ static void nf_tables_abort_release(struct nft_trans *trans)
break;
case NFT_MSG_NEWSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv);
+ nft_trans_elem(trans).priv, true);
+ break;
+ case NFT_MSG_NEWOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
break;
}
kfree(trans);
@@ -4186,6 +4943,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
+ case NFT_MSG_NEWOBJ:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ break;
+ case NFT_MSG_DELOBJ:
+ trans->ctx.table->use++;
+ nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ nft_trans_destroy(trans);
+ break;
}
}
@@ -4254,9 +5020,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
const struct nft_chain *chain);
static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
const struct nft_data *data;
@@ -4280,7 +5046,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
{
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
- const struct nft_set *set;
+ struct nft_set *set;
struct nft_set_binding *binding;
struct nft_set_iter iter;
@@ -4343,6 +5109,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
}
/**
+ * nft_parse_u32_check - fetch u32 attribute and check for maximum value
+ *
+ * @attr: netlink attribute to fetch value from
+ * @max: maximum value to be stored in dest
+ * @dest: pointer to the variable
+ *
+ * Parse, check and store a given u32 netlink attribute into variable.
+ * This function returns -ERANGE if the value goes over maximum value.
+ * Otherwise a 0 is returned and the attribute value is stored in the
+ * destination variable.
+ */
+int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
+{
+ u32 val;
+
+ val = ntohl(nla_get_be32(attr));
+ if (val > max)
+ return -ERANGE;
+
+ *dest = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_u32_check);
+
+/**
* nft_parse_register - parse a register value from a netlink attribute
*
* @attr: netlink attribute
@@ -4707,6 +5498,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
{
struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
struct nft_set *set, *ns;
struct nft_ctx ctx = {
@@ -4733,6 +5525,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
table->use--;
nft_set_destroy(set);
}
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ list_del(&obj->list);
+ table->use--;
+ nft_obj_destroy(obj);
+ }
list_for_each_entry_safe(chain, nc, &table->chains, list) {
list_del(&chain->list);
table->use--;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index fb8b5892b5ff..65dbeadcb118 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -53,10 +53,10 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
nft_trace_notify(info);
- nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
- pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
- chain->table->name, chain->name, comments[type],
- rulenum);
+ nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
+ nft_in(pkt), nft_out(pkt), &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type], rulenum);
}
static inline void nft_trace_packet(struct nft_traceinfo *info,
@@ -93,12 +93,15 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb);
- else
+ else {
+ if (!pkt->tprot_set)
+ return false;
ptr = skb_network_header(skb) + pkt->xt.thoff;
+ }
ptr += priv->offset;
- if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ if (unlikely(ptr + priv->len > skb_tail_pointer(skb)))
return false;
*dest = 0;
@@ -121,7 +124,7 @@ unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
- const struct net *net = pkt->net;
+ const struct net *net = nft_net(pkt);
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
@@ -175,6 +178,7 @@ next_rule:
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
+ case NF_STOLEN:
nft_trace_packet(&info, chain, rule,
rulenum, NFT_TRACETYPE_RULE);
return regs.verdict.code;
@@ -228,63 +232,40 @@ next_rule:
}
EXPORT_SYMBOL_GPL(nft_do_chain);
+static struct nft_expr_type *nft_basic_types[] = {
+ &nft_imm_type,
+ &nft_cmp_type,
+ &nft_lookup_type,
+ &nft_bitwise_type,
+ &nft_byteorder_type,
+ &nft_payload_type,
+ &nft_dynset_type,
+ &nft_range_type,
+};
+
int __init nf_tables_core_module_init(void)
{
- int err;
-
- err = nft_immediate_module_init();
- if (err < 0)
- goto err1;
-
- err = nft_cmp_module_init();
- if (err < 0)
- goto err2;
+ int err, i;
- err = nft_lookup_module_init();
- if (err < 0)
- goto err3;
-
- err = nft_bitwise_module_init();
- if (err < 0)
- goto err4;
-
- err = nft_byteorder_module_init();
- if (err < 0)
- goto err5;
-
- err = nft_payload_module_init();
- if (err < 0)
- goto err6;
-
- err = nft_dynset_module_init();
- if (err < 0)
- goto err7;
+ for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) {
+ err = nft_register_expr(nft_basic_types[i]);
+ if (err)
+ goto err;
+ }
return 0;
-err7:
- nft_payload_module_exit();
-err6:
- nft_byteorder_module_exit();
-err5:
- nft_bitwise_module_exit();
-err4:
- nft_lookup_module_exit();
-err3:
- nft_cmp_module_exit();
-err2:
- nft_immediate_module_exit();
-err1:
+err:
+ while (i-- > 0)
+ nft_unregister_expr(nft_basic_types[i]);
return err;
}
void nf_tables_core_module_exit(void)
{
- nft_dynset_module_exit();
- nft_payload_module_exit();
- nft_byteorder_module_exit();
- nft_bitwise_module_exit();
- nft_lookup_module_exit();
- nft_cmp_module_exit();
- nft_immediate_module_exit();
+ int i;
+
+ i = ARRAY_SIZE(nft_basic_types);
+ while (i-- > 0)
+ nft_unregister_expr(nft_basic_types[i]);
}
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 6b5f76295d3d..f713cc205669 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void)
{
int ret;
- nft_register_chain_type(&filter_inet);
+ ret = nft_register_chain_type(&filter_inet);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_inet_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_inet);
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 75d696f11045..9e2ae424b640 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -15,78 +15,6 @@
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>
-static inline void
-nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct iphdr *iph, _iph;
- u32 len, thoff;
-
- nft_set_pktinfo(pkt, skb, state);
-
- iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
- &_iph);
- if (!iph)
- return;
-
- if (iph->ihl < 5 || iph->version != 4)
- return;
-
- len = ntohs(iph->tot_len);
- thoff = iph->ihl * 4;
- if (skb->len < len)
- return;
- else if (len < thoff)
- return;
-
- pkt->tprot = iph->protocol;
- pkt->xt.thoff = thoff;
- pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
-}
-
-static inline void
-__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
- struct ipv6hdr *ip6h, _ip6h;
- unsigned int thoff = 0;
- unsigned short frag_off;
- int protohdr;
- u32 pkt_len;
-
- ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
- &_ip6h);
- if (!ip6h)
- return;
-
- if (ip6h->version != 6)
- return;
-
- pkt_len = ntohs(ip6h->payload_len);
- if (pkt_len + sizeof(*ip6h) > skb->len)
- return;
-
- protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
- if (protohdr < 0)
- return;
-
- pkt->tprot = protohdr;
- pkt->xt.thoff = thoff;
- pkt->xt.fragoff = frag_off;
-#endif
-}
-
-static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- nft_set_pktinfo(pkt, skb, state);
- __nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
-}
-
static unsigned int
nft_do_chain_netdev(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -95,13 +23,13 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
break;
case htons(ETH_P_IPV6):
- nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
+ nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
break;
default:
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
break;
}
@@ -221,14 +149,25 @@ static int __init nf_tables_netdev_init(void)
{
int ret;
- nft_register_chain_type(&nft_filter_chain_netdev);
- ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
- if (ret < 0) {
- nft_unregister_chain_type(&nft_filter_chain_netdev);
+ ret = nft_register_chain_type(&nft_filter_chain_netdev);
+ if (ret)
return ret;
- }
- register_netdevice_notifier(&nf_tables_netdev_notifier);
+
+ ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+ if (ret)
+ goto err1;
+
+ ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
+ if (ret)
+ goto err2;
+
return 0;
+
+err2:
+ unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+err1:
+ nft_unregister_chain_type(&nft_filter_chain_netdev);
+ return ret;
}
static void __exit nf_tables_netdev_exit(void)
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index fa24a5b398b1..12eb9041dca2 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -113,20 +113,22 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
const struct sk_buff *skb = pkt->skb;
- unsigned int len = min_t(unsigned int,
- pkt->xt.thoff - skb_network_offset(skb),
- NFT_TRACETYPE_NETWORK_HSIZE);
int off = skb_network_offset(skb);
+ unsigned int len, nh_end;
+ nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+ len = min_t(unsigned int, nh_end - skb_network_offset(skb),
+ NFT_TRACETYPE_NETWORK_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
return -1;
- len = min_t(unsigned int, skb->len - pkt->xt.thoff,
- NFT_TRACETYPE_TRANSPORT_HSIZE);
-
- if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
- pkt->xt.thoff, len))
- return -1;
+ if (pkt->tprot_set) {
+ len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ NFT_TRACETYPE_TRANSPORT_HSIZE);
+ if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
+ pkt->xt.thoff, len))
+ return -1;
+ }
if (!skb_mac_header_was_set(skb))
return 0;
@@ -169,7 +171,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
unsigned int size;
int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE;
- if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE))
+ if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
return;
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
@@ -205,7 +207,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
- if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf)))
+ if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
@@ -247,7 +249,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
goto nla_put_failure;
if (!info->packet_dumped) {
- if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out))
+ if (nf_trace_fill_dev_info(skb, nft_in(pkt), nft_out(pkt)))
goto nla_put_failure;
if (nf_trace_fill_pkt_info(skb, pkt))
@@ -256,7 +258,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
}
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
+ nfnetlink_send(skb, nft_net(pkt), 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
return;
nla_put_failure:
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 2278d9ab723b..a09fa9fd8f3d 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -22,7 +22,7 @@
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index e924e95fcc7f..3b79f34b5095 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
if (help == NULL)
return NF_DROP;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (helper == NULL)
return NF_DROP;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6577db524ef6..08247bf7d7b8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -80,7 +80,7 @@ struct nfulnl_instance {
#define INSTANCE_BUCKETS 16
-static int nfnl_log_net_id __read_mostly;
+static unsigned int nfnl_log_net_id __read_mostly;
struct nfnl_log_net {
spinlock_t instances_lock;
@@ -330,7 +330,7 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
if (!skb) {
if (n > pkt_size) {
/* try to allocate only as much as we need for current
@@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
htonl(indev->ifindex)) ||
/* this is the bridge group "brX" */
- /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ /* rcu_read_lock()ed by nf_hook_thresh or
+ * nf_log_packet.
+ */
nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
@@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
htonl(outdev->ifindex)) ||
/* this is the bridge group "brX" */
- /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ /* rcu_read_lock()ed by nf_hook_thresh or
+ * nf_log_packet.
+ */
nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
goto nla_put_failure;
@@ -534,7 +538,7 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (skb->tstamp.tv64) {
+ if (skb->tstamp) {
struct nfulnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
@@ -1148,6 +1152,7 @@ MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
+MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f49f45081acb..3ee0b8a000a4 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -69,7 +69,7 @@ struct nfqnl_instance {
* Following fields are dirtied for each queued packet,
* keep them in same cache line if possible.
*/
- spinlock_t lock;
+ spinlock_t lock ____cacheline_aligned_in_smp;
unsigned int queue_total;
unsigned int id_sequence; /* 'sequence' of pkt ids */
struct list_head queue_list; /* packets in queue */
@@ -77,7 +77,7 @@ struct nfqnl_instance {
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
-static int nfnl_queue_net_id __read_mostly;
+static unsigned int nfnl_queue_net_id __read_mostly;
#define INSTANCE_BUCKETS 16
struct nfnl_queue_net {
@@ -384,7 +384,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp.tv64)
+ if (entskb->tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -555,7 +555,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entskb->tstamp.tv64) {
+ if (entskb->tstamp) {
struct nfqnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
@@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
- /* rcu_read_lock()ed by nf_hook_slow() */
+ /* rcu_read_lock()ed by nf_hook_thresh */
queue = instance_lookup(q, queuenum);
if (!queue)
return -ESRCH;
@@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = {
.notifier_call = nfqnl_rcv_dev_event,
};
-static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
{
- return entry->elem == (struct nf_hook_ops *)ops_ptr;
+ return rcu_access_pointer(entry->hook) ==
+ (struct nf_hook_entry *)entry_ptr;
}
-static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+static void nfqnl_nf_hook_drop(struct net *net,
+ const struct nf_hook_entry *hook)
{
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
@@ -1522,9 +1524,16 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_notifier;
}
- register_netdevice_notifier(&nfqnl_dev_notifier);
+ status = register_netdevice_notifier(&nfqnl_dev_notifier);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register netdevice notifier\n");
+ goto cleanup_netlink_subsys;
+ }
+
return status;
+cleanup_netlink_subsys:
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index d71cc18fa35d..877d9acd91ef 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
{
struct nft_bitwise *priv = nft_expr_priv(expr);
struct nft_data_desc d1, d2;
+ u32 len;
int err;
if (tb[NFTA_BITWISE_SREG] == NULL ||
@@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
tb[NFTA_BITWISE_XOR] == NULL)
return -EINVAL;
- priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
+
+ priv->len = len;
+
priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
err = nft_validate_register_load(priv->sreg, priv->len);
if (err < 0)
@@ -115,7 +121,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_bitwise_type;
static const struct nft_expr_ops nft_bitwise_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
@@ -124,20 +129,10 @@ static const struct nft_expr_ops nft_bitwise_ops = {
.dump = nft_bitwise_dump,
};
-static struct nft_expr_type nft_bitwise_type __read_mostly = {
+struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
.ops = &nft_bitwise_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_bitwise_module_init(void)
-{
- return nft_register_expr(&nft_bitwise_type);
-}
-
-void nft_bitwise_module_exit(void)
-{
- nft_unregister_expr(&nft_bitwise_type);
-}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index b78c28ba465f..13d4e421a6b3 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_byteorder *priv = nft_expr_priv(expr);
+ u32 size, len;
int err;
if (tb[NFTA_BYTEORDER_SREG] == NULL ||
@@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+ err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size);
+ if (err < 0)
+ return err;
+
+ priv->size = size;
+
switch (priv->size) {
case 2:
case 4:
@@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
}
priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
- priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
+
+ priv->len = len;
+
err = nft_validate_register_load(priv->sreg, priv->len);
if (err < 0)
return err;
@@ -158,7 +169,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_byteorder_type;
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
@@ -167,20 +177,10 @@ static const struct nft_expr_ops nft_byteorder_ops = {
.dump = nft_byteorder_dump,
};
-static struct nft_expr_type nft_byteorder_type __read_mostly = {
+struct nft_expr_type nft_byteorder_type __read_mostly = {
.name = "byteorder",
.ops = &nft_byteorder_ops,
.policy = nft_byteorder_policy,
.maxattr = NFTA_BYTEORDER_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_byteorder_module_init(void)
-{
- return nft_register_expr(&nft_byteorder_type);
-}
-
-void nft_byteorder_module_exit(void)
-{
- nft_unregister_expr(&nft_byteorder_type);
-}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index e25b35d70e4d..2b96effeadc1 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -107,7 +107,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_cmp_type;
static const struct nft_expr_ops nft_cmp_ops = {
.type = &nft_cmp_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
@@ -208,20 +207,10 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return &nft_cmp_ops;
}
-static struct nft_expr_type nft_cmp_type __read_mostly = {
+struct nft_expr_type nft_cmp_type __read_mostly = {
.name = "cmp",
.select_ops = nft_cmp_select_ops,
.policy = nft_cmp_policy,
.maxattr = NFTA_CMP_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_cmp_module_init(void)
-{
- return nft_register_expr(&nft_cmp_type);
-}
-
-void nft_cmp_module_exit(void)
-{
- nft_unregister_expr(&nft_cmp_type);
-}
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 77db8358ab14..7f8422213341 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -18,105 +18,197 @@
#include <net/netfilter/nf_tables.h>
struct nft_counter {
- u64 bytes;
- u64 packets;
-};
-
-struct nft_counter_percpu {
- struct nft_counter counter;
- struct u64_stats_sync syncp;
+ s64 bytes;
+ s64 packets;
};
struct nft_counter_percpu_priv {
- struct nft_counter_percpu __percpu *counter;
+ struct nft_counter __percpu *counter;
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+
+static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu *this_cpu;
+ struct nft_counter *this_cpu;
+ seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- u64_stats_update_begin(&this_cpu->syncp);
- this_cpu->counter.bytes += pkt->skb->len;
- this_cpu->counter.packets++;
- u64_stats_update_end(&this_cpu->syncp);
+ myseq = this_cpu_ptr(&nft_counter_seq);
+
+ write_seqcount_begin(myseq);
+
+ this_cpu->bytes += pkt->skb->len;
+ this_cpu->packets++;
+
+ write_seqcount_end(myseq);
local_bh_enable();
}
-static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+static inline void nft_counter_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_do_init(const struct nlattr * const tb[],
+ struct nft_counter_percpu_priv *priv)
+{
+ struct nft_counter __percpu *cpu_stats;
+ struct nft_counter *this_cpu;
+
+ cpu_stats = alloc_percpu(struct nft_counter);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ if (tb[NFTA_COUNTER_PACKETS]) {
+ this_cpu->packets =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ }
+ if (tb[NFTA_COUNTER_BYTES]) {
+ this_cpu->bytes =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ }
+ preempt_enable();
+ priv->counter = cpu_stats;
+ return 0;
+}
+
+static int nft_counter_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_init(tb, priv);
+}
+
+static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
+{
+ free_percpu(priv->counter);
+}
+
+static void nft_counter_obj_destroy(struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_destroy(priv);
+}
+
+static void nft_counter_reset(struct nft_counter_percpu_priv __percpu *priv,
struct nft_counter *total)
{
- const struct nft_counter_percpu *cpu_stats;
+ struct nft_counter *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ this_cpu->packets -= total->packets;
+ this_cpu->bytes -= total->bytes;
+ local_bh_enable();
+}
+
+static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
+ struct nft_counter *total)
+{
+ struct nft_counter *this_cpu;
+ const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- cpu_stats = per_cpu_ptr(counter, cpu);
+ myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- bytes = cpu_stats->counter.bytes;
- packets = cpu_stats->counter.packets;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ seq = read_seqcount_begin(myseq);
+ bytes = this_cpu->bytes;
+ packets = this_cpu->packets;
+ } while (read_seqcount_retry(myseq, seq));
- total->packets += packets;
- total->bytes += bytes;
+ total->bytes += bytes;
+ total->packets += packets;
}
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_do_dump(struct sk_buff *skb,
+ struct nft_counter_percpu_priv *priv,
+ bool reset)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
struct nft_counter total;
- nft_counter_fetch(priv->counter, &total);
+ nft_counter_fetch(priv, &total);
if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
NFTA_COUNTER_PAD) ||
nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets),
NFTA_COUNTER_PAD))
goto nla_put_failure;
+
+ if (reset)
+ nft_counter_reset(priv, &total);
+
return 0;
nla_put_failure:
return -1;
}
+static int nft_counter_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_dump(skb, priv, reset);
+}
+
static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
+static struct nft_object_type nft_counter_obj __read_mostly = {
+ .type = NFT_OBJECT_COUNTER,
+ .size = sizeof(struct nft_counter_percpu_priv),
+ .maxattr = NFTA_COUNTER_MAX,
+ .policy = nft_counter_policy,
+ .eval = nft_counter_obj_eval,
+ .init = nft_counter_obj_init,
+ .destroy = nft_counter_obj_destroy,
+ .dump = nft_counter_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ return nft_counter_do_dump(skb, priv, false);
+}
+
static int nft_counter_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu __percpu *cpu_stats;
- struct nft_counter_percpu *this_cpu;
-
- cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
- if (cpu_stats == NULL)
- return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->counter.packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
- }
- if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->counter.bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
- }
- preempt_enable();
- priv->counter = cpu_stats;
- return 0;
+ return nft_counter_do_init(tb, priv);
}
static void nft_counter_destroy(const struct nft_ctx *ctx,
@@ -124,28 +216,27 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- free_percpu(priv->counter);
+ nft_counter_do_destroy(priv);
}
static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
- struct nft_counter_percpu __percpu *cpu_stats;
- struct nft_counter_percpu *this_cpu;
+ struct nft_counter __percpu *cpu_stats;
+ struct nft_counter *this_cpu;
struct nft_counter total;
- nft_counter_fetch(priv->counter, &total);
+ nft_counter_fetch(priv, &total);
- cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
- GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
if (cpu_stats == NULL)
return -ENOMEM;
preempt_disable();
this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->counter.packets = total.packets;
- this_cpu->counter.bytes = total.bytes;
+ this_cpu->packets = total.packets;
+ this_cpu->bytes = total.bytes;
preempt_enable();
priv_clone->counter = cpu_stats;
@@ -174,12 +265,29 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
static int __init nft_counter_module_init(void)
{
- return nft_register_expr(&nft_counter_type);
+ int cpu, err;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+
+ err = nft_register_obj(&nft_counter_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_counter_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_counter_obj);
+ return err;
}
static void __exit nft_counter_module_exit(void)
{
nft_unregister_expr(&nft_counter_type);
+ nft_unregister_obj(&nft_counter_obj);
}
module_init(nft_counter_module_init);
@@ -188,3 +296,4 @@ module_exit(nft_counter_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("counter");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 51e180f2a003..e6baeaebe653 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -128,15 +129,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, &count, sizeof(count));
return;
}
+ case NFT_CT_L3PROTOCOL:
+ *dest = nf_ct_l3num(ct);
+ return;
+ case NFT_CT_PROTOCOL:
+ *dest = nf_ct_protonum(ct);
+ return;
default:
break;
}
tuple = &ct->tuplehash[priv->dir].tuple;
switch (priv->key) {
- case NFT_CT_L3PROTOCOL:
- *dest = nf_ct_l3num(ct);
- return;
case NFT_CT_SRC:
memcpy(dest, tuple->src.u3.all,
nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
@@ -145,9 +149,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, tuple->dst.u3.all,
nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
return;
- case NFT_CT_PROTOCOL:
- *dest = nf_ct_protonum(ct);
- return;
case NFT_CT_PROTO_SRC:
*dest = (__force __u16)tuple->src.u.all;
return;
@@ -207,37 +208,37 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
-static int nft_ct_l3proto_try_module_get(uint8_t family)
+static int nft_ct_netns_get(struct net *net, uint8_t family)
{
int err;
if (family == NFPROTO_INET) {
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+ err = nf_ct_netns_get(net, NFPROTO_IPV4);
if (err < 0)
goto err1;
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+ err = nf_ct_netns_get(net, NFPROTO_IPV6);
if (err < 0)
goto err2;
} else {
- err = nf_ct_l3proto_try_module_get(family);
+ err = nf_ct_netns_get(net, family);
if (err < 0)
goto err1;
}
return 0;
err2:
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
return err;
}
-static void nft_ct_l3proto_module_put(uint8_t family)
+static void nft_ct_netns_put(struct net *net, uint8_t family)
{
if (family == NFPROTO_INET) {
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
- nf_ct_l3proto_module_put(NFPROTO_IPV6);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV6);
} else
- nf_ct_l3proto_module_put(family);
+ nf_ct_netns_put(net, family);
}
static int nft_ct_get_init(const struct nft_ctx *ctx,
@@ -283,8 +284,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
- if (tb[NFTA_CT_DIRECTION] == NULL)
- return -EINVAL;
+ /* For compatibility, do not report error if NFTA_CT_DIRECTION
+ * attribute is specified.
+ */
len = sizeof(u8);
break;
case NFT_CT_SRC:
@@ -340,7 +342,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
return err;
@@ -363,6 +365,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
@@ -386,7 +390,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
if (err < 0)
goto err1;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
goto err1;
@@ -401,7 +405,7 @@ err1:
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- nft_ct_l3proto_module_put(ctx->afi->family);
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
}
static void nft_ct_set_destroy(const struct nft_ctx *ctx,
@@ -419,7 +423,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
break;
}
- nft_ct_l3proto_module_put(ctx->afi->family);
+ nft_ct_netns_put(ctx->net, ctx->afi->family);
}
static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -432,8 +436,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
switch (priv->key) {
- case NFT_CT_L3PROTOCOL:
- case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
case NFT_CT_PROTO_SRC:
@@ -517,15 +519,61 @@ static struct nft_expr_type nft_ct_type __read_mostly = {
.owner = THIS_MODULE,
};
+static void nft_notrack_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct sk_buff *skb = pkt->skb;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if (ct)
+ return;
+
+ ct = nf_ct_untracked_get();
+ atomic_inc(&ct->ct_general.use);
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+}
+
+static struct nft_expr_type nft_notrack_type;
+static const struct nft_expr_ops nft_notrack_ops = {
+ .type = &nft_notrack_type,
+ .size = NFT_EXPR_SIZE(0),
+ .eval = nft_notrack_eval,
+};
+
+static struct nft_expr_type nft_notrack_type __read_mostly = {
+ .name = "notrack",
+ .ops = &nft_notrack_ops,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_ct_module_init(void)
{
+ int err;
+
BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE);
- return nft_register_expr(&nft_ct_type);
+ err = nft_register_expr(&nft_ct_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_notrack_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_ct_type);
+ return err;
}
static void __exit nft_ct_module_exit(void)
{
+ nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
}
@@ -535,3 +583,4 @@ module_exit(nft_ct_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("ct");
+MODULE_ALIAS_NFT_EXPR("notrack");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 0af26699bf04..049ad2d9ee66 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -22,6 +22,7 @@ struct nft_dynset {
enum nft_dynset_ops op:8;
enum nft_registers sreg_key:8;
enum nft_registers sreg_data:8;
+ bool invert;
u64 timeout;
struct nft_expr *expr;
struct nft_set_binding binding;
@@ -43,18 +44,22 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
&regs->data[priv->sreg_key],
&regs->data[priv->sreg_data],
timeout, GFP_ATOMIC);
- if (elem == NULL) {
- if (set->size)
- atomic_dec(&set->nelems);
- return NULL;
- }
+ if (elem == NULL)
+ goto err1;
ext = nft_set_elem_ext(set, elem);
if (priv->expr != NULL &&
nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
- return NULL;
+ goto err2;
return elem;
+
+err2:
+ nft_set_elem_destroy(set, elem, false);
+err1:
+ if (set->size)
+ atomic_dec(&set->nelems);
+ return NULL;
}
static void nft_dynset_eval(const struct nft_expr *expr,
@@ -82,20 +87,26 @@ static void nft_dynset_eval(const struct nft_expr *expr,
if (sexpr != NULL)
sexpr->ops->eval(sexpr, regs, pkt);
+
+ if (priv->invert)
+ regs->verdict.code = NFT_BREAK;
return;
}
out:
- regs->verdict.code = NFT_BREAK;
+ if (!priv->invert)
+ regs->verdict.code = NFT_BREAK;
}
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
- [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING },
+ [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
[NFTA_DYNSET_OP] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -113,6 +124,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
tb[NFTA_DYNSET_SREG_KEY] == NULL)
return -EINVAL;
+ if (tb[NFTA_DYNSET_FLAGS]) {
+ u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
+
+ if (flags & ~NFT_DYNSET_F_INV)
+ return -EINVAL;
+ if (flags & NFT_DYNSET_F_INV)
+ priv->invert = true;
+ }
+
set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
genmask);
if (IS_ERR(set)) {
@@ -124,6 +144,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return PTR_ERR(set);
}
+ if (set->ops->update == NULL)
+ return -EOPNOTSUPP;
+
if (set->flags & NFT_SET_CONSTANT)
return -EBUSY;
@@ -143,7 +166,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = be64_to_cpu(nla_get_be64(tb[NFTA_DYNSET_TIMEOUT]));
+ timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+ tb[NFTA_DYNSET_TIMEOUT])));
}
priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
@@ -220,6 +244,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
+ u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -230,18 +255,20 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout),
+ if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT,
+ cpu_to_be64(jiffies_to_msecs(priv->timeout)),
NFTA_DYNSET_PAD))
goto nla_put_failure;
if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_dynset_type;
static const struct nft_expr_ops nft_dynset_ops = {
.type = &nft_dynset_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)),
@@ -251,20 +278,10 @@ static const struct nft_expr_ops nft_dynset_ops = {
.dump = nft_dynset_dump,
};
-static struct nft_expr_type nft_dynset_type __read_mostly = {
+struct nft_expr_type nft_dynset_type __read_mostly = {
.name = "dynset",
.ops = &nft_dynset_ops,
.policy = nft_dynset_policy,
.maxattr = NFTA_DYNSET_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_dynset_module_init(void)
-{
- return nft_register_expr(&nft_dynset_type);
-}
-
-void nft_dynset_module_exit(void)
-{
- nft_unregister_expr(&nft_dynset_type);
-}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 82c264e40278..47beb3abcc9d 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -60,6 +60,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
{
struct nft_exthdr *priv = nft_expr_priv(expr);
u32 offset, len;
+ int err;
if (tb[NFTA_EXTHDR_DREG] == NULL ||
tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -67,11 +68,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
tb[NFTA_EXTHDR_LEN] == NULL)
return -EINVAL;
- offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
+ if (err < 0)
+ return err;
- if (offset > U8_MAX || len > U8_MAX)
- return -ERANGE;
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
new file mode 100644
index 000000000000..29a4906adc27
--- /dev/null
+++ b/net/netfilter/nft_fib.c
@@ -0,0 +1,159 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Generic part shared by ipv4 and ipv6 backends.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
+ [NFTA_FIB_DREG] = { .type = NLA_U32 },
+ [NFTA_FIB_RESULT] = { .type = NLA_U32 },
+ [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL(nft_fib_policy);
+
+#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
+ NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)
+
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF: /* fallthrough */
+ case NFT_FIB_RESULT_OIFNAME:
+ hooks = (1 << NF_INET_PRE_ROUTING);
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ if (priv->flags & NFTA_FIB_F_IIF)
+ hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ else
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING);
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+EXPORT_SYMBOL_GPL(nft_fib_validate);
+
+int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ if (!tb[NFTA_FIB_DREG] || !tb[NFTA_FIB_RESULT] || !tb[NFTA_FIB_FLAGS])
+ return -EINVAL;
+
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
+
+ if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
+ return -EINVAL;
+
+ if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
+ (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR))
+ return -EINVAL;
+ if ((priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) ==
+ (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF))
+ return -EINVAL;
+ if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0)
+ return -EINVAL;
+
+ priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+ priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ if (priv->flags & NFTA_FIB_F_OIF)
+ return -EINVAL;
+ len = sizeof(int);
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ if (priv->flags & NFTA_FIB_F_OIF)
+ return -EINVAL;
+ len = IFNAMSIZ;
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+ if (err < 0)
+ return err;
+
+ return nft_fib_validate(ctx, expr, NULL);
+}
+EXPORT_SYMBOL_GPL(nft_fib_init);
+
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_FIB_DREG, priv->dreg))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_FIB_RESULT, htonl(priv->result)))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_FIB_FLAGS, htonl(priv->flags)))
+ return -1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_fib_dump);
+
+void nft_fib_store_result(void *reg, enum nft_fib_result r,
+ const struct nft_pktinfo *pkt, int index)
+{
+ struct net_device *dev;
+ u32 *dreg = reg;
+
+ switch (r) {
+ case NFT_FIB_RESULT_OIF:
+ *dreg = index;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ dev = dev_get_by_index_rcu(nft_net(pkt), index);
+ strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ *dreg = 0;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_fib_store_result);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
new file mode 100644
index 000000000000..9120fc7228f4
--- /dev/null
+++ b/net/netfilter/nft_fib_inet.c
@@ -0,0 +1,82 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+#include <net/netfilter/nft_fib.h>
+
+static void nft_fib_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ return nft_fib4_eval(expr, regs, pkt);
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return nft_fib4_eval_type(expr, regs, pkt);
+ }
+ break;
+ case NFPROTO_IPV6:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ return nft_fib6_eval(expr, regs, pkt);
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return nft_fib6_eval_type(expr, regs, pkt);
+ }
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_fib_inet_type;
+static const struct nft_expr_ops nft_fib_inet_ops = {
+ .type = &nft_fib_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib_inet_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static struct nft_expr_type nft_fib_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "fib",
+ .ops = &nft_fib_inet_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib_inet_module_init(void)
+{
+ return nft_register_expr(&nft_fib_inet_type);
+}
+
+static void __exit nft_fib_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib_inet_type);
+}
+
+module_init(nft_fib_inet_module_init);
+module_exit(nft_fib_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 763ebc3e0b2b..ce13a50b9189 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
- nf_dup_netdev_egress(pkt, oif);
- regs->verdict.code = NF_DROP;
+ nf_fwd_netdev_egress(pkt, oif);
+ regs->verdict.code = NF_STOLEN;
}
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 564fa7929ed5..eb2721af898d 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,395 +1,151 @@
/*
- * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/log2.h>
-#include <linux/jhash.h>
#include <linux/netlink.h>
-#include <linux/workqueue.h>
-#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
-
-/* We target a hash table size of 4, element hint is 75% of final size */
-#define NFT_HASH_ELEMENT_HINT 3
+#include <net/netfilter/nf_tables_core.h>
+#include <linux/jhash.h>
struct nft_hash {
- struct rhashtable ht;
- struct delayed_work gc_work;
-};
-
-struct nft_hash_elem {
- struct rhash_head node;
- struct nft_set_ext ext;
-};
-
-struct nft_hash_cmp_arg {
- const struct nft_set *set;
- const u32 *key;
- u8 genmask;
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ u8 len;
+ u32 modulus;
+ u32 seed;
+ u32 offset;
};
-static const struct rhashtable_params nft_hash_params;
-
-static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
-{
- const struct nft_hash_cmp_arg *arg = data;
-
- return jhash(arg->key, len, seed);
-}
-
-static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
-{
- const struct nft_hash_elem *he = data;
-
- return jhash(nft_set_ext_key(&he->ext), len, seed);
-}
-
-static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct nft_hash_cmp_arg *x = arg->key;
- const struct nft_hash_elem *he = ptr;
-
- if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
- return 1;
- if (nft_set_elem_expired(&he->ext))
- return 1;
- if (!nft_set_elem_active(&he->ext, x->genmask))
- return 1;
- return 0;
-}
-
-static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
-{
- struct nft_hash *priv = nft_set_priv(set);
- const struct nft_hash_elem *he;
- struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_cur(net),
- .set = set,
- .key = key,
- };
-
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
- if (he != NULL)
- *ext = &he->ext;
-
- return !!he;
-}
-
-static bool nft_hash_update(struct nft_set *set, const u32 *key,
- void *(*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *regs),
- const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext)
-{
- struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *he;
- struct nft_hash_cmp_arg arg = {
- .genmask = NFT_GENMASK_ANY,
- .set = set,
- .key = key,
- };
-
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
- if (he != NULL)
- goto out;
-
- he = new(set, expr, regs);
- if (he == NULL)
- goto err1;
- if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
- nft_hash_params))
- goto err2;
-out:
- *ext = &he->ext;
- return true;
-
-err2:
- nft_set_elem_destroy(set, he);
-err1:
- return false;
-}
-
-static int nft_hash_insert(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *he = elem->priv;
- struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_next(net),
- .set = set,
- .key = elem->key.val.data,
- };
-
- return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
- nft_hash_params);
-}
-
-static void nft_hash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- struct nft_hash_elem *he = elem->priv;
-
- nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
-}
-
-static void *nft_hash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *he;
- struct nft_hash_cmp_arg arg = {
- .genmask = nft_genmask_next(net),
- .set = set,
- .key = elem->key.val.data,
- };
-
- rcu_read_lock();
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
- if (he != NULL) {
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext))
- nft_set_elem_change_active(net, set, &he->ext);
- else
- he = NULL;
- }
- rcu_read_unlock();
-
- return he;
-}
-
-static void nft_hash_remove(const struct nft_set *set,
- const struct nft_set_elem *elem)
+static void nft_hash_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash *priv = nft_expr_priv(expr);
+ const void *data = &regs->data[priv->sreg];
+ u32 h;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+ h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+ regs->data[priv->dreg] = h + priv->offset;
}
-static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
- struct nft_set_iter *iter)
-{
- struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *he;
- struct rhashtable_iter hti;
- struct nft_set_elem elem;
- int err;
-
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
- iter->err = err;
- if (err)
- return;
-
- err = rhashtable_walk_start(&hti);
- if (err && err != -EAGAIN) {
- iter->err = err;
- goto out;
- }
-
- while ((he = rhashtable_walk_next(&hti))) {
- if (IS_ERR(he)) {
- err = PTR_ERR(he);
- if (err != -EAGAIN) {
- iter->err = err;
- goto out;
- }
-
- continue;
- }
-
- if (iter->count < iter->skip)
- goto cont;
- if (nft_set_elem_expired(&he->ext))
- goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
-
- elem.priv = he;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
- if (iter->err < 0)
- goto out;
-
-cont:
- iter->count++;
- }
-
-out:
- rhashtable_walk_stop(&hti);
- rhashtable_walk_exit(&hti);
-}
-
-static void nft_hash_gc(struct work_struct *work)
-{
- struct nft_set *set;
- struct nft_hash_elem *he;
- struct nft_hash *priv;
- struct nft_set_gc_batch *gcb = NULL;
- struct rhashtable_iter hti;
- int err;
-
- priv = container_of(work, struct nft_hash, gc_work.work);
- set = nft_set_container_of(priv);
-
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
- if (err)
- goto schedule;
-
- err = rhashtable_walk_start(&hti);
- if (err && err != -EAGAIN)
- goto out;
-
- while ((he = rhashtable_walk_next(&hti))) {
- if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- goto out;
- continue;
- }
-
- if (!nft_set_elem_expired(&he->ext))
- continue;
- if (nft_set_elem_mark_busy(&he->ext))
- continue;
-
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- goto out;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
- }
-out:
- rhashtable_walk_stop(&hti);
- rhashtable_walk_exit(&hti);
-
- nft_set_gc_batch_complete(gcb);
-schedule:
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
-}
-
-static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
-{
- return sizeof(struct nft_hash);
-}
-
-static const struct rhashtable_params nft_hash_params = {
- .head_offset = offsetof(struct nft_hash_elem, node),
- .hashfn = nft_hash_key,
- .obj_hashfn = nft_hash_obj,
- .obj_cmpfn = nft_hash_cmp,
- .automatic_shrinking = true,
+static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
+ [NFTA_HASH_SREG] = { .type = NLA_U32 },
+ [NFTA_HASH_DREG] = { .type = NLA_U32 },
+ [NFTA_HASH_LEN] = { .type = NLA_U32 },
+ [NFTA_HASH_MODULUS] = { .type = NLA_U32 },
+ [NFTA_HASH_SEED] = { .type = NLA_U32 },
+ [NFTA_HASH_OFFSET] = { .type = NLA_U32 },
};
-static int nft_hash_init(const struct nft_set *set,
- const struct nft_set_desc *desc,
+static int nft_hash_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_hash *priv = nft_set_priv(set);
- struct rhashtable_params params = nft_hash_params;
+ struct nft_hash *priv = nft_expr_priv(expr);
+ u32 len;
int err;
- params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
- params.key_len = set->klen;
+ if (!tb[NFTA_HASH_SREG] ||
+ !tb[NFTA_HASH_DREG] ||
+ !tb[NFTA_HASH_LEN] ||
+ !tb[NFTA_HASH_MODULUS])
+ return -EINVAL;
+
+ if (tb[NFTA_HASH_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- err = rhashtable_init(&priv->ht, &params);
+ priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
+ priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
+
+ err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
if (err < 0)
return err;
-
- INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+ if (len == 0)
+ return -ERANGE;
+
+ priv->len = len;
+
+ priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
+ if (priv->modulus <= 1)
+ return -ERANGE;
+
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
+ if (tb[NFTA_HASH_SEED])
+ priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
+ else
+ get_random_bytes(&priv->seed, sizeof(priv->seed));
+
+ return nft_validate_register_load(priv->sreg, len) &&
+ nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_hash_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_hash *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HASH_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
+ goto nla_put_failure;
+ if (priv->offset != 0)
+ if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
+ goto nla_put_failure;
return 0;
-}
-
-static void nft_hash_elem_destroy(void *ptr, void *arg)
-{
- nft_set_elem_destroy((const struct nft_set *)arg, ptr);
-}
-
-static void nft_hash_destroy(const struct nft_set *set)
-{
- struct nft_hash *priv = nft_set_priv(set);
- cancel_delayed_work_sync(&priv->gc_work);
- rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
- (void *)set);
+nla_put_failure:
+ return -1;
}
-static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
- struct nft_set_estimate *est)
-{
- unsigned int esize;
-
- esize = sizeof(struct nft_hash_elem);
- if (desc->size) {
- est->size = sizeof(struct nft_hash) +
- roundup_pow_of_two(desc->size * 4 / 3) *
- sizeof(struct nft_hash_elem *) +
- desc->size * esize;
- } else {
- /* Resizing happens when the load drops below 30% or goes
- * above 75%. The average of 52.5% load (approximated by 50%)
- * is used for the size estimation of the hash buckets,
- * meaning we calculate two buckets per element.
- */
- est->size = esize + 2 * sizeof(struct nft_hash_elem *);
- }
-
- est->class = NFT_SET_CLASS_O_1;
-
- return true;
-}
-
-static struct nft_set_ops nft_hash_ops __read_mostly = {
- .privsize = nft_hash_privsize,
- .elemsize = offsetof(struct nft_hash_elem, ext),
- .estimate = nft_hash_estimate,
+static struct nft_expr_type nft_hash_type;
+static const struct nft_expr_ops nft_hash_ops = {
+ .type = &nft_hash_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_hash)),
+ .eval = nft_hash_eval,
.init = nft_hash_init,
- .destroy = nft_hash_destroy,
- .insert = nft_hash_insert,
- .activate = nft_hash_activate,
- .deactivate = nft_hash_deactivate,
- .remove = nft_hash_remove,
- .lookup = nft_hash_lookup,
- .update = nft_hash_update,
- .walk = nft_hash_walk,
- .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
+ .dump = nft_hash_dump,
+};
+
+static struct nft_expr_type nft_hash_type __read_mostly = {
+ .name = "hash",
+ .ops = &nft_hash_ops,
+ .policy = nft_hash_policy,
+ .maxattr = NFTA_HASH_MAX,
.owner = THIS_MODULE,
};
static int __init nft_hash_module_init(void)
{
- return nft_register_set(&nft_hash_ops);
+ return nft_register_expr(&nft_hash_type);
}
static void __exit nft_hash_module_exit(void)
{
- nft_unregister_set(&nft_hash_ops);
+ nft_unregister_expr(&nft_hash_type);
}
module_init(nft_hash_module_init);
module_exit(nft_hash_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("hash");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index db3b746858e3..728baf88295a 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -53,6 +53,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
+
priv->dlen = desc.len;
priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
@@ -101,7 +102,6 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
return 0;
}
-static struct nft_expr_type nft_imm_type;
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -112,20 +112,10 @@ static const struct nft_expr_ops nft_imm_ops = {
.validate = nft_immediate_validate,
};
-static struct nft_expr_type nft_imm_type __read_mostly = {
+struct nft_expr_type nft_imm_type __read_mostly = {
.name = "immediate",
.ops = &nft_imm_ops,
.policy = nft_immediate_policy,
.maxattr = NFTA_IMMEDIATE_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_immediate_module_init(void)
-{
- return nft_register_expr(&nft_imm_type);
-}
-
-void nft_immediate_module_exit(void)
-{
- nft_unregister_expr(&nft_imm_type);
-}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 070b98938e02..c6baf412236d 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -145,7 +145,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate);
+ priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate);
return 0;
}
@@ -170,7 +170,7 @@ static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_limit *priv = nft_expr_priv(expr);
- u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate);
+ u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
regs->verdict.code = NFT_BREAK;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 24a73bb26e94..6f6e64423643 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -32,13 +32,15 @@ static void nft_log_eval(const struct nft_expr *expr,
{
const struct nft_log *priv = nft_expr_priv(expr);
- nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
- pkt->out, &priv->loginfo, "%s", priv->prefix);
+ nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
+ nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
+ priv->prefix);
}
static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_GROUP] = { .type = NLA_U16 },
- [NFTA_LOG_PREFIX] = { .type = NLA_STRING },
+ [NFTA_LOG_PREFIX] = { .type = NLA_STRING,
+ .len = NF_LOG_PREFIXLEN - 1 },
[NFTA_LOG_SNAPLEN] = { .type = NLA_U32 },
[NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 },
[NFTA_LOG_LEVEL] = { .type = NLA_U32 },
@@ -58,8 +60,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
if (tb[NFTA_LOG_LEVEL] != NULL &&
tb[NFTA_LOG_GROUP] != NULL)
return -EINVAL;
- if (tb[NFTA_LOG_GROUP] != NULL)
+ if (tb[NFTA_LOG_GROUP] != NULL) {
li->type = NF_LOG_TYPE_ULOG;
+ if (tb[NFTA_LOG_FLAGS] != NULL)
+ return -EINVAL;
+ }
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
@@ -87,6 +92,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
if (tb[NFTA_LOG_FLAGS] != NULL) {
li->u.log.logflags =
ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
+ if (li->u.log.logflags & ~NF_LOG_MASK) {
+ err = -EINVAL;
+ goto err1;
+ }
}
break;
case NF_LOG_TYPE_ULOG:
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index e164325d1bc0..e21aea7e5ec8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -35,22 +35,22 @@ static void nft_lookup_eval(const struct nft_expr *expr,
const struct nft_set_ext *ext;
bool found;
- found = set->ops->lookup(pkt->net, set, &regs->data[priv->sreg], &ext) ^
- priv->invert;
-
+ found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+ &ext) ^ priv->invert;
if (!found) {
regs->verdict.code = NFT_BREAK;
return;
}
- if (found && set->flags & NFT_SET_MAP)
+ if (set->flags & NFT_SET_MAP)
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
- [NFTA_LOOKUP_SET] = { .type = NLA_STRING },
+ [NFTA_LOOKUP_SET] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
@@ -155,7 +155,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_lookup_type;
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -165,20 +164,10 @@ static const struct nft_expr_ops nft_lookup_ops = {
.dump = nft_lookup_dump,
};
-static struct nft_expr_type nft_lookup_type __read_mostly = {
+struct nft_expr_type nft_lookup_type __read_mostly = {
.name = "lookup",
.ops = &nft_lookup_ops,
.policy = nft_lookup_policy,
.maxattr = NFTA_LOOKUP_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_lookup_module_init(void)
-{
- return nft_register_expr(&nft_lookup_type);
-}
-
-void nft_lookup_module_exit(void)
-{
- nft_unregister_expr(&nft_lookup_type);
-}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 81b5ad6165ac..11ce016cd479 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
}
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_masq_init);
@@ -105,4 +105,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_masq_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8a6bc7630912..66c7f4b4c49b 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -36,7 +36,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
- const struct net_device *in = pkt->in, *out = pkt->out;
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
@@ -49,9 +49,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(__be16 *)dest = skb->protocol;
break;
case NFT_META_NFPROTO:
- *dest = pkt->pf;
+ *dest = nft_pf(pkt);
break;
case NFT_META_L4PROTO:
+ if (!pkt->tprot_set)
+ goto err;
*dest = pkt->tprot;
break;
case NFT_META_PRIORITY:
@@ -144,7 +146,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
}
- switch (pkt->pf) {
+ switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
if (ipv4_is_multicast(ip_hdr(skb)->daddr))
*dest = PACKET_MULTICAST;
@@ -308,6 +310,11 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
case NFPROTO_NETDEV:
hooks = 1 << NF_NETDEV_INGRESS;
break;
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = 1 << NF_INET_PRE_ROUTING;
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ee2d71753746..19a7bf3236f9 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -209,7 +209,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, family);
}
static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -257,12 +257,21 @@ nla_put_failure:
return -1;
}
+static void
+nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ nf_ct_netns_put(ctx->net, priv->family);
+}
+
static struct nft_expr_type nft_nat_type;
static const struct nft_expr_ops nft_nat_ops = {
.type = &nft_nat_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
.eval = nft_nat_eval,
.init = nft_nat_init,
+ .destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
};
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
new file mode 100644
index 000000000000..a66b36097b8f
--- /dev/null
+++ b/net/netfilter/nft_numgen.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/static_key.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
+
+struct nft_ng_inc {
+ enum nft_registers dreg:8;
+ u32 modulus;
+ atomic_t counter;
+ u32 offset;
+};
+
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_ng_inc *priv = nft_expr_priv(expr);
+ u32 nval, oval;
+
+ do {
+ oval = atomic_read(&priv->counter);
+ nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
+ } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+
+ regs->data[priv->dreg] = nval + priv->offset;
+}
+
+static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
+ [NFTA_NG_DREG] = { .type = NLA_U32 },
+ [NFTA_NG_MODULUS] = { .type = NLA_U32 },
+ [NFTA_NG_TYPE] = { .type = NLA_U32 },
+ [NFTA_NG_OFFSET] = { .type = NLA_U32 },
+};
+
+static int nft_ng_inc_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_NG_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
+ priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+ if (priv->modulus == 0)
+ return -ERANGE;
+
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
+ priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+ atomic_set(&priv->counter, priv->modulus - 1);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
+ u32 modulus, enum nft_ng_types type, u32 offset)
+{
+ if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL,
+ priv->offset);
+}
+
+struct nft_ng_random {
+ enum nft_registers dreg:8;
+ u32 modulus;
+ u32 offset;
+};
+
+static void nft_ng_random_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_ng_random *priv = nft_expr_priv(expr);
+ struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+ u32 val;
+
+ val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
+ regs->data[priv->dreg] = val + priv->offset;
+}
+
+static int nft_ng_random_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_NG_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
+ priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+ if (priv->modulus == 0)
+ return -ERANGE;
+
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
+ prandom_init_once(&nft_numgen_prandom_state);
+
+ priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM,
+ priv->offset);
+}
+
+static struct nft_expr_type nft_ng_type;
+static const struct nft_expr_ops nft_ng_inc_ops = {
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+ .eval = nft_ng_inc_eval,
+ .init = nft_ng_inc_init,
+ .dump = nft_ng_inc_dump,
+};
+
+static const struct nft_expr_ops nft_ng_random_ops = {
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
+ .eval = nft_ng_random_eval,
+ .init = nft_ng_random_init,
+ .dump = nft_ng_random_dump,
+};
+
+static const struct nft_expr_ops *
+nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+ u32 type;
+
+ if (!tb[NFTA_NG_DREG] ||
+ !tb[NFTA_NG_MODULUS] ||
+ !tb[NFTA_NG_TYPE])
+ return ERR_PTR(-EINVAL);
+
+ type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE]));
+
+ switch (type) {
+ case NFT_NG_INCREMENTAL:
+ return &nft_ng_inc_ops;
+ case NFT_NG_RANDOM:
+ return &nft_ng_random_ops;
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_ng_type __read_mostly = {
+ .name = "numgen",
+ .select_ops = &nft_ng_select_ops,
+ .policy = nft_ng_policy,
+ .maxattr = NFTA_NG_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_ng_module_init(void)
+{
+ return nft_register_expr(&nft_ng_type);
+}
+
+static void __exit nft_ng_module_exit(void)
+{
+ nft_unregister_expr(&nft_ng_type);
+}
+
+module_init(nft_ng_module_init);
+module_exit(nft_ng_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("numgen");
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
new file mode 100644
index 000000000000..1ae8c49ca4a1
--- /dev/null
+++ b/net/netfilter/nft_objref.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
+
+static void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ u32 objtype;
+
+ if (!tb[NFTA_OBJREF_IMM_NAME] ||
+ !tb[NFTA_OBJREF_IMM_TYPE])
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
+ obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ genmask);
+ if (IS_ERR(obj))
+ return -ENOENT;
+
+ nft_objref_priv(expr) = obj;
+ obj->use++;
+
+ return 0;
+}
+
+static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_object *obj = nft_objref_priv(expr);
+
+ if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->use--;
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
+ .eval = nft_objref_eval,
+ .init = nft_objref_init,
+ .destroy = nft_objref_destroy,
+ .dump = nft_objref_dump,
+};
+
+struct nft_objref_map {
+ struct nft_set *set;
+ enum nft_registers sreg:8;
+ struct nft_set_binding binding;
+};
+
+static void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ const struct nft_set *set = priv->set;
+ const struct nft_set_ext *ext;
+ struct nft_object *obj;
+ bool found;
+
+ found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+ &ext);
+ if (!found) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ obj = *nft_set_ext_obj(ext);
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set *set;
+ int err;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_OBJREF_SET_ID]) {
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_OBJREF_SET_ID],
+ genmask);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (!(set->flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
+ err = nft_validate_register_load(priv->sreg, set->klen);
+ if (err < 0)
+ return err;
+
+ priv->binding.flags = set->flags & NFT_SET_OBJECT;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ return err;
+
+ priv->set = set;
+ return 0;
+}
+
+static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) ||
+ nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_map_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_map_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
+ .eval = nft_objref_map_eval,
+ .init = nft_objref_map_init,
+ .destroy = nft_objref_map_destroy,
+ .dump = nft_objref_map_dump,
+};
+
+static const struct nft_expr_ops *
+nft_objref_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_OBJREF_SET_SREG] &&
+ (tb[NFTA_OBJREF_SET_NAME] ||
+ tb[NFTA_OBJREF_SET_ID]))
+ return &nft_objref_map_ops;
+ else if (tb[NFTA_OBJREF_IMM_NAME] &&
+ tb[NFTA_OBJREF_IMM_TYPE])
+ return &nft_objref_ops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
+ [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_objref_type __read_mostly = {
+ .name = "objref",
+ .select_ops = nft_objref_select_ops,
+ .policy = nft_objref_policy,
+ .maxattr = NFTA_OBJREF_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_objref_module_init(void)
+{
+ return nft_register_expr(&nft_objref_type);
+}
+
+static void __exit nft_objref_module_exit(void)
+{
+ nft_unregister_expr(&nft_objref_type);
+}
+
+module_init(nft_objref_module_init);
+module_exit(nft_objref_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 12cd4bf16d17..7d699bbd45b0 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,10 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
@@ -92,6 +97,8 @@ static void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!pkt->tprot_set)
+ goto err;
offset = pkt->xt.thoff;
break;
default:
@@ -146,7 +153,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_payload_type;
static const struct nft_expr_ops nft_payload_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
@@ -163,6 +169,103 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.dump = nft_payload_dump,
};
+static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
+{
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ if (*sum == 0)
+ *sum = CSUM_MANGLED_0;
+}
+
+static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff)
+{
+ struct udphdr *uh, _uh;
+
+ uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh);
+ if (!uh)
+ return false;
+
+ return uh->check;
+}
+
+static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ unsigned int *l4csum_offset)
+{
+ switch (pkt->tprot) {
+ case IPPROTO_TCP:
+ *l4csum_offset = offsetof(struct tcphdr, check);
+ break;
+ case IPPROTO_UDP:
+ if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ return -1;
+ /* Fall through. */
+ case IPPROTO_UDPLITE:
+ *l4csum_offset = offsetof(struct udphdr, check);
+ break;
+ case IPPROTO_ICMPV6:
+ *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ break;
+ default:
+ return -1;
+ }
+
+ *l4csum_offset += pkt->xt.thoff;
+ return 0;
+}
+
+static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ __wsum fsum, __wsum tsum)
+{
+ int l4csum_offset;
+ __sum16 sum;
+
+ /* If we cannot determine layer 4 checksum offset or this packet doesn't
+ * require layer 4 checksum recalculation, skip this packet.
+ */
+ if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0)
+ return 0;
+
+ if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ /* Checksum mangling for an arbitrary amount of bytes, based on
+ * inet_proto_csum_replace*() functions.
+ */
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ nft_csum_replace(&sum, fsum, tsum);
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum),
+ tsum);
+ }
+ } else {
+ sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum),
+ tsum));
+ }
+
+ if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
+ __wsum fsum, __wsum tsum, int csum_offset)
+{
+ __sum16 sum;
+
+ if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ nft_csum_replace(&sum, fsum, tsum);
+ if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ return 0;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -172,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
const u32 *src = &regs->data[priv->sreg];
int offset, csum_offset;
__wsum fsum, tsum;
- __sum16 sum;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
@@ -184,6 +286,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!pkt->tprot_set)
+ goto err;
offset = pkt->xt.thoff;
break;
default:
@@ -193,21 +297,18 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
csum_offset = offset + priv->csum_offset;
offset += priv->offset;
- if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
(priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
- if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
- goto err;
-
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
- sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum),
- tsum));
- if (sum == 0)
- sum = CSUM_MANGLED_0;
- if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
- skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
+ goto err;
+
+ if (priv->csum_flags &&
+ nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0)
goto err;
}
@@ -237,6 +338,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
priv->csum_offset =
ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
+ u32 flags;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS]));
+ if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR)
+ return -EINVAL;
+
+ priv->csum_flags = flags;
+ }
switch (priv->csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
@@ -259,7 +369,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr
nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET,
- htonl(priv->csum_offset)))
+ htonl(priv->csum_offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags)))
goto nla_put_failure;
return 0;
@@ -316,20 +427,10 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
return &nft_payload_ops;
}
-static struct nft_expr_type nft_payload_type __read_mostly = {
+struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_payload_module_init(void)
-{
- return nft_register_expr(&nft_payload_type);
-}
-
-void nft_payload_module_exit(void)
-{
- nft_unregister_expr(&nft_payload_type);
-}
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 61d216eb7917..dbb6aaff67ec 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -22,9 +22,10 @@
static u32 jhash_initval __read_mostly;
struct nft_queue {
- u16 queuenum;
- u16 queues_total;
- u16 flags;
+ enum nft_registers sreg_qnum:8;
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
};
static void nft_queue_eval(const struct nft_expr *expr,
@@ -37,12 +38,12 @@ static void nft_queue_eval(const struct nft_expr *expr,
if (priv->queues_total > 1) {
if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) {
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
queue = priv->queuenum + cpu % priv->queues_total;
} else {
queue = nfqueue_hash(pkt->skb, queue,
- priv->queues_total, pkt->pf,
+ priv->queues_total, nft_pf(pkt),
jhash_initval);
}
}
@@ -54,31 +55,78 @@ static void nft_queue_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static void nft_queue_sreg_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ u32 queue, ret;
+
+ queue = regs->data[priv->sreg_qnum];
+
+ ret = NF_QUEUE_NR(queue);
+ if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ regs->verdict.code = ret;
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
[NFTA_QUEUE_FLAGS] = { .type = NLA_U16 },
+ [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 },
};
static int nft_queue_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_queue *priv = nft_expr_priv(expr);
+ u32 maxid;
- if (tb[NFTA_QUEUE_NUM] == NULL)
- return -EINVAL;
-
- init_hashrandom(&jhash_initval);
priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
- if (tb[NFTA_QUEUE_TOTAL] != NULL)
+ if (tb[NFTA_QUEUE_TOTAL])
priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
- if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+ else
+ priv->queues_total = 1;
+
+ if (priv->queues_total == 0)
+ return -EINVAL;
+
+ maxid = priv->queues_total - 1 + priv->queuenum;
+ if (maxid > U16_MAX)
+ return -ERANGE;
+
+ if (tb[NFTA_QUEUE_FLAGS]) {
+ priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+ if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_queue_sreg_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
+ err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_QUEUE_FLAGS]) {
priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
return -EINVAL;
+ if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT)
+ return -EOPNOTSUPP;
}
+
return 0;
}
@@ -97,6 +145,21 @@ nla_put_failure:
return -1;
}
+static int
+nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) ||
+ nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static struct nft_expr_type nft_queue_type;
static const struct nft_expr_ops nft_queue_ops = {
.type = &nft_queue_type,
@@ -106,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = {
.dump = nft_queue_dump,
};
+static const struct nft_expr_ops nft_queue_sreg_ops = {
+ .type = &nft_queue_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+ .eval = nft_queue_sreg_eval,
+ .init = nft_queue_sreg_init,
+ .dump = nft_queue_sreg_dump,
+};
+
+static const struct nft_expr_ops *
+nft_queue_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM])
+ return ERR_PTR(-EINVAL);
+
+ init_hashrandom(&jhash_initval);
+
+ if (tb[NFTA_QUEUE_NUM])
+ return &nft_queue_ops;
+
+ if (tb[NFTA_QUEUE_SREG_QNUM])
+ return &nft_queue_sreg_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
static struct nft_expr_type nft_queue_type __read_mostly = {
.name = "queue",
- .ops = &nft_queue_ops,
+ .select_ops = &nft_queue_select_ops,
.policy = nft_queue_policy,
.maxattr = NFTA_QUEUE_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
new file mode 100644
index 000000000000..2d6fe3559912
--- /dev/null
+++ b/net/netfilter/nft_quota.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_quota {
+ u64 quota;
+ unsigned long flags;
+ atomic64_t consumed;
+};
+
+static inline bool nft_overquota(struct nft_quota *priv,
+ const struct sk_buff *skb)
+{
+ return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
+}
+
+static inline bool nft_quota_invert(struct nft_quota *priv)
+{
+ return priv->flags & NFT_QUOTA_F_INV;
+}
+
+static inline void nft_quota_do_eval(struct nft_quota *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
+ [NFTA_QUOTA_BYTES] = { .type = NLA_U64 },
+ [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 },
+ [NFTA_QUOTA_CONSUMED] = { .type = NLA_U64 },
+};
+
+#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */
+
+static void nft_quota_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+ bool overquota;
+
+ overquota = nft_overquota(priv, pkt->skb);
+ if (overquota ^ nft_quota_invert(priv))
+ regs->verdict.code = NFT_BREAK;
+
+ if (overquota &&
+ !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+ nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
+ NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+}
+
+static int nft_quota_do_init(const struct nlattr * const tb[],
+ struct nft_quota *priv)
+{
+ unsigned long flags = 0;
+ u64 quota, consumed = 0;
+
+ if (!tb[NFTA_QUOTA_BYTES])
+ return -EINVAL;
+
+ quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES]));
+ if (quota > S64_MAX)
+ return -EOVERFLOW;
+
+ if (tb[NFTA_QUOTA_CONSUMED]) {
+ consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED]));
+ if (consumed > quota)
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_QUOTA_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
+ if (flags & ~NFT_QUOTA_F_INV)
+ return -EINVAL;
+ if (flags & NFT_QUOTA_F_DEPLETED)
+ return -EOPNOTSUPP;
+ }
+
+ priv->quota = quota;
+ priv->flags = flags;
+ atomic64_set(&priv->consumed, consumed);
+
+ return 0;
+}
+
+static int nft_quota_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
+ bool reset)
+{
+ u64 consumed, consumed_cap;
+ u32 flags = priv->flags;
+
+ /* Since we inconditionally increment consumed quota for each packet
+ * that we see, don't go over the quota boundary in what we send to
+ * userspace.
+ */
+ consumed = atomic64_read(&priv->consumed);
+ if (consumed >= priv->quota) {
+ consumed_cap = priv->quota;
+ flags |= NFT_QUOTA_F_DEPLETED;
+ } else {
+ consumed_cap = consumed;
+ }
+
+ if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
+ NFTA_QUOTA_PAD) ||
+ nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap),
+ NFTA_QUOTA_PAD) ||
+ nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
+ goto nla_put_failure;
+
+ if (reset) {
+ atomic64_sub(consumed, &priv->consumed);
+ clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
+ }
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
+ bool reset)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_dump(skb, priv, reset);
+}
+
+static struct nft_object_type nft_quota_obj __read_mostly = {
+ .type = NFT_OBJECT_QUOTA,
+ .size = sizeof(struct nft_quota),
+ .maxattr = NFTA_QUOTA_MAX,
+ .policy = nft_quota_policy,
+ .init = nft_quota_obj_init,
+ .eval = nft_quota_obj_eval,
+ .dump = nft_quota_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_quota_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_dump(skb, priv, false);
+}
+
+static struct nft_expr_type nft_quota_type;
+static const struct nft_expr_ops nft_quota_ops = {
+ .type = &nft_quota_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
+ .eval = nft_quota_eval,
+ .init = nft_quota_init,
+ .dump = nft_quota_dump,
+};
+
+static struct nft_expr_type nft_quota_type __read_mostly = {
+ .name = "quota",
+ .ops = &nft_quota_ops,
+ .policy = nft_quota_policy,
+ .maxattr = NFTA_QUOTA_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_quota_module_init(void)
+{
+ int err;
+
+ err = nft_register_obj(&nft_quota_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_quota_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_quota_obj);
+ return err;
+}
+
+static void __exit nft_quota_module_exit(void)
+{
+ nft_unregister_expr(&nft_quota_type);
+ nft_unregister_obj(&nft_quota_obj);
+}
+
+module_init(nft_quota_module_init);
+module_exit(nft_quota_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("quota");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
new file mode 100644
index 000000000000..9edc74eedc10
--- /dev/null
+++ b/net/netfilter/nft_range.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_range_expr {
+ struct nft_data data_from;
+ struct nft_data data_to;
+ enum nft_registers sreg:8;
+ u8 len;
+ enum nft_range_ops op:8;
+};
+
+static void nft_range_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_range_expr *priv = nft_expr_priv(expr);
+ int d1, d2;
+
+ d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len);
+ d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len);
+ switch (priv->op) {
+ case NFT_RANGE_EQ:
+ if (d1 < 0 || d2 > 0)
+ regs->verdict.code = NFT_BREAK;
+ break;
+ case NFT_RANGE_NEQ:
+ if (d1 >= 0 && d2 <= 0)
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+
+static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
+ [NFTA_RANGE_SREG] = { .type = NLA_U32 },
+ [NFTA_RANGE_OP] = { .type = NLA_U32 },
+ [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
+ [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_range_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc_from, desc_to;
+ int err;
+ u32 op;
+
+ if (!tb[NFTA_RANGE_SREG] ||
+ !tb[NFTA_RANGE_OP] ||
+ !tb[NFTA_RANGE_FROM_DATA] ||
+ !tb[NFTA_RANGE_TO_DATA])
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
+ &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
+ &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ if (err < 0)
+ goto err1;
+
+ if (desc_from.len != desc_to.len) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
+ err = nft_validate_register_load(priv->sreg, desc_from.len);
+ if (err < 0)
+ goto err2;
+
+ err = nft_parse_u32_check(tb[NFTA_RANGE_OP], U8_MAX, &op);
+ if (err < 0)
+ goto err2;
+
+ switch (op) {
+ case NFT_RANGE_EQ:
+ case NFT_RANGE_NEQ:
+ break;
+ default:
+ err = -EINVAL;
+ goto err2;
+ }
+
+ priv->op = op;
+ priv->len = desc_from.len;
+ return 0;
+err2:
+ nft_data_uninit(&priv->data_to, desc_to.type);
+err1:
+ nft_data_uninit(&priv->data_from, desc_from.type);
+ return err;
+}
+
+static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_range_expr *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from,
+ NFT_DATA_VALUE, priv->len) < 0 ||
+ nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_expr_ops nft_range_ops = {
+ .type = &nft_range_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
+ .eval = nft_range_eval,
+ .init = nft_range_init,
+ .dump = nft_range_dump,
+};
+
+struct nft_expr_type nft_range_type __read_mostly = {
+ .name = "range",
+ .ops = &nft_range_ops,
+ .policy = nft_range_policy,
+ .maxattr = NFTA_RANGE_MAX,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 03f7bf40ae75..40dcd05146d5 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -79,7 +79,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_redir_init);
@@ -108,4 +108,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_redir_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index e79d9ca2ffee..9e90a02cb104 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -23,36 +23,36 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
{
struct nft_reject *priv = nft_expr_priv(expr);
- switch (pkt->pf) {
+ switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->hook);
+ nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
nft_reject_icmp_code(priv->icmp_code),
- pkt->hook);
+ nft_hook(pkt));
break;
}
break;
case NFPROTO_IPV6:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
- pkt->hook);
+ nf_send_unreach6(nft_net(pkt), pkt->skb,
+ priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb,
+ nf_send_unreach6(nft_net(pkt), pkt->skb,
nft_reject_icmpv6_code(priv->icmp_code),
- pkt->hook);
+ nft_hook(pkt));
break;
}
break;
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
new file mode 100644
index 000000000000..d3eb640bc784
--- /dev/null
+++ b/net/netfilter/nft_rt.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2016 Anders K. Pedersen <akp@cohaesio.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/dst.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_rt {
+ enum nft_rt_keys key:8;
+ enum nft_registers dreg:8;
+};
+
+void nft_rt_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_rt *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct dst_entry *dst;
+
+ dst = skb_dst(skb);
+ if (!dst)
+ goto err;
+
+ switch (priv->key) {
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_RT_CLASSID:
+ *dest = dst->tclassid;
+ break;
+#endif
+ case NFT_RT_NEXTHOP4:
+ if (nft_pf(pkt) != NFPROTO_IPV4)
+ goto err;
+
+ *dest = rt_nexthop((const struct rtable *)dst,
+ ip_hdr(skb)->daddr);
+ break;
+ case NFT_RT_NEXTHOP6:
+ if (nft_pf(pkt) != NFPROTO_IPV6)
+ goto err;
+
+ memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+ &ipv6_hdr(skb)->daddr),
+ sizeof(struct in6_addr));
+ break;
+ default:
+ WARN_ON(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
+ [NFTA_RT_DREG] = { .type = NLA_U32 },
+ [NFTA_RT_KEY] = { .type = NLA_U32 },
+};
+
+int nft_rt_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_rt *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (tb[NFTA_RT_KEY] == NULL ||
+ tb[NFTA_RT_DREG] == NULL)
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_RT_KEY]));
+ switch (priv->key) {
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_RT_CLASSID:
+#endif
+ case NFT_RT_NEXTHOP4:
+ len = sizeof(u32);
+ break;
+ case NFT_RT_NEXTHOP6:
+ len = sizeof(struct in6_addr);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+int nft_rt_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_rt *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_RT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_RT_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_rt_type;
+static const struct nft_expr_ops nft_rt_get_ops = {
+ .type = &nft_rt_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
+ .eval = nft_rt_get_eval,
+ .init = nft_rt_get_init,
+ .dump = nft_rt_get_dump,
+};
+
+static struct nft_expr_type nft_rt_type __read_mostly = {
+ .name = "rt",
+ .ops = &nft_rt_get_ops,
+ .policy = nft_rt_policy,
+ .maxattr = NFTA_RT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_rt_module_init(void)
+{
+ return nft_register_expr(&nft_rt_type);
+}
+
+static void __exit nft_rt_module_exit(void)
+{
+ nft_unregister_expr(&nft_rt_type);
+}
+
+module_init(nft_rt_module_init);
+module_exit(nft_rt_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
+MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
new file mode 100644
index 000000000000..e36069fb76ae
--- /dev/null
+++ b/net/netfilter/nft_set_hash.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/workqueue.h>
+#include <linux/rhashtable.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* We target a hash table size of 4, element hint is 75% of final size */
+#define NFT_HASH_ELEMENT_HINT 3
+
+struct nft_hash {
+ struct rhashtable ht;
+ struct delayed_work gc_work;
+};
+
+struct nft_hash_elem {
+ struct rhash_head node;
+ struct nft_set_ext ext;
+};
+
+struct nft_hash_cmp_arg {
+ const struct nft_set *set;
+ const u32 *key;
+ u8 genmask;
+};
+
+static const struct rhashtable_params nft_hash_params;
+
+static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
+{
+ const struct nft_hash_cmp_arg *arg = data;
+
+ return jhash(arg->key, len, seed);
+}
+
+static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct nft_hash_elem *he = data;
+
+ return jhash(nft_set_ext_key(&he->ext), len, seed);
+}
+
+static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nft_hash_cmp_arg *x = arg->key;
+ const struct nft_hash_elem *he = ptr;
+
+ if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
+ return 1;
+ if (nft_set_elem_expired(&he->ext))
+ return 1;
+ if (!nft_set_elem_active(&he->ext, x->genmask))
+ return 1;
+ return 0;
+}
+
+static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_elem *he;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_cur(net),
+ .set = set,
+ .key = key,
+ };
+
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL)
+ *ext = &he->ext;
+
+ return !!he;
+}
+
+static bool nft_hash_update(struct nft_set *set, const u32 *key,
+ void *(*new)(struct nft_set *,
+ const struct nft_expr *,
+ struct nft_regs *regs),
+ const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_set_ext **ext)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he, *prev;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = NFT_GENMASK_ANY,
+ .set = set,
+ .key = key,
+ };
+
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL)
+ goto out;
+
+ he = new(set, expr, regs);
+ if (he == NULL)
+ goto err1;
+
+ prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
+ nft_hash_params);
+ if (IS_ERR(prev))
+ goto err2;
+
+ /* Another cpu may race to insert the element with the same key */
+ if (prev) {
+ nft_set_elem_destroy(set, he, true);
+ he = prev;
+ }
+
+out:
+ *ext = &he->ext;
+ return true;
+
+err2:
+ nft_set_elem_destroy(set, he, true);
+err1:
+ return false;
+}
+
+static int nft_hash_insert(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **ext)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_next(net),
+ .set = set,
+ .key = elem->key.val.data,
+ };
+ struct nft_hash_elem *prev;
+
+ prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
+ nft_hash_params);
+ if (IS_ERR(prev))
+ return PTR_ERR(prev);
+ if (prev) {
+ *ext = &prev->ext;
+ return -EEXIST;
+ }
+ return 0;
+}
+
+static void nft_hash_activate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash_elem *he = elem->priv;
+
+ nft_set_elem_change_active(net, set, &he->ext);
+ nft_set_elem_clear_busy(&he->ext);
+}
+
+static bool nft_hash_deactivate_one(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_hash_elem *he = priv;
+
+ if (!nft_set_elem_mark_busy(&he->ext) ||
+ !nft_is_active(net, &he->ext)) {
+ nft_set_elem_change_active(net, set, &he->ext);
+ return true;
+ }
+ return false;
+}
+
+static void *nft_hash_deactivate(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_next(net),
+ .set = set,
+ .key = elem->key.val.data,
+ };
+
+ rcu_read_lock();
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL &&
+ !nft_hash_deactivate_one(net, set, he))
+ he = NULL;
+
+ rcu_read_unlock();
+
+ return he;
+}
+
+static void nft_hash_remove(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he = elem->priv;
+
+ rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+}
+
+static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he;
+ struct rhashtable_iter hti;
+ struct nft_set_elem elem;
+ int err;
+
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+ iter->err = err;
+ if (err)
+ return;
+
+ err = rhashtable_walk_start(&hti);
+ if (err && err != -EAGAIN) {
+ iter->err = err;
+ goto out;
+ }
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ err = PTR_ERR(he);
+ if (err != -EAGAIN) {
+ iter->err = err;
+ goto out;
+ }
+
+ continue;
+ }
+
+ if (iter->count < iter->skip)
+ goto cont;
+ if (nft_set_elem_expired(&he->ext))
+ goto cont;
+ if (!nft_set_elem_active(&he->ext, iter->genmask))
+ goto cont;
+
+ elem.priv = he;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0)
+ goto out;
+
+cont:
+ iter->count++;
+ }
+
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+}
+
+static void nft_hash_gc(struct work_struct *work)
+{
+ struct nft_set *set;
+ struct nft_hash_elem *he;
+ struct nft_hash *priv;
+ struct nft_set_gc_batch *gcb = NULL;
+ struct rhashtable_iter hti;
+ int err;
+
+ priv = container_of(work, struct nft_hash, gc_work.work);
+ set = nft_set_container_of(priv);
+
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+ if (err)
+ goto schedule;
+
+ err = rhashtable_walk_start(&hti);
+ if (err && err != -EAGAIN)
+ goto out;
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN)
+ goto out;
+ continue;
+ }
+
+ if (!nft_set_elem_expired(&he->ext))
+ continue;
+ if (nft_set_elem_mark_busy(&he->ext))
+ continue;
+
+ gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+ if (gcb == NULL)
+ goto out;
+ rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+ atomic_dec(&set->nelems);
+ nft_set_gc_batch_add(gcb, he);
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ nft_set_gc_batch_complete(gcb);
+schedule:
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+}
+
+static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
+{
+ return sizeof(struct nft_hash);
+}
+
+static const struct rhashtable_params nft_hash_params = {
+ .head_offset = offsetof(struct nft_hash_elem, node),
+ .hashfn = nft_hash_key,
+ .obj_hashfn = nft_hash_obj,
+ .obj_cmpfn = nft_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+static int nft_hash_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const tb[])
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct rhashtable_params params = nft_hash_params;
+ int err;
+
+ params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
+ params.key_len = set->klen;
+
+ err = rhashtable_init(&priv->ht, &params);
+ if (err < 0)
+ return err;
+
+ INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
+ if (set->flags & NFT_SET_TIMEOUT)
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+ return 0;
+}
+
+static void nft_hash_elem_destroy(void *ptr, void *arg)
+{
+ nft_set_elem_destroy((const struct nft_set *)arg, ptr, true);
+}
+
+static void nft_hash_destroy(const struct nft_set *set)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+
+ cancel_delayed_work_sync(&priv->gc_work);
+ rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
+ (void *)set);
+}
+
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int esize;
+
+ esize = sizeof(struct nft_hash_elem);
+ if (desc->size) {
+ est->size = sizeof(struct nft_hash) +
+ roundup_pow_of_two(desc->size * 4 / 3) *
+ sizeof(struct nft_hash_elem *) +
+ desc->size * esize;
+ } else {
+ /* Resizing happens when the load drops below 30% or goes
+ * above 75%. The average of 52.5% load (approximated by 50%)
+ * is used for the size estimation of the hash buckets,
+ * meaning we calculate two buckets per element.
+ */
+ est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+ }
+
+ est->class = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
+static struct nft_set_ops nft_hash_ops __read_mostly = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .deactivate_one = nft_hash_deactivate_one,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup,
+ .update = nft_hash_update,
+ .walk = nft_hash_walk,
+ .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_hash_module_init(void)
+{
+ return nft_register_set(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+ nft_unregister_set(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_set_rbtree.c
index ffe9ae062d23..f06f55ee516d 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -96,7 +96,8 @@ out:
}
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
- struct nft_rbtree_elem *new)
+ struct nft_rbtree_elem *new,
+ struct nft_set_ext **ext)
{
struct nft_rbtree *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
@@ -124,8 +125,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
else if (!nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_end(new))
p = &parent->rb_right;
- else
+ else {
+ *ext = &rbe->ext;
return -EEXIST;
+ }
}
}
}
@@ -135,13 +138,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
}
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **ext)
{
struct nft_rbtree_elem *rbe = elem->priv;
int err;
spin_lock_bh(&nft_rbtree_lock);
- err = __nft_rbtree_insert(net, set, rbe);
+ err = __nft_rbtree_insert(net, set, rbe, ext);
spin_unlock_bh(&nft_rbtree_lock);
return err;
@@ -167,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net,
nft_set_elem_change_active(net, set, &rbe->ext);
}
+static bool nft_rbtree_deactivate_one(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_rbtree_elem *rbe = priv;
+
+ nft_set_elem_change_active(net, set, &rbe->ext);
+ return true;
+}
+
static void *nft_rbtree_deactivate(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -200,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
continue;
}
- nft_set_elem_change_active(net, set, &rbe->ext);
+ nft_rbtree_deactivate_one(net, set, rbe);
return rbe;
}
}
@@ -208,7 +221,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
}
static void nft_rbtree_walk(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
struct nft_set_iter *iter)
{
const struct nft_rbtree *priv = nft_set_priv(set);
@@ -262,7 +275,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- nft_set_elem_destroy(set, rbe);
+ nft_set_elem_destroy(set, rbe, true);
}
}
@@ -291,6 +304,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
.insert = nft_rbtree_insert,
.remove = nft_rbtree_remove,
.deactivate = nft_rbtree_deactivate,
+ .deactivate_one = nft_rbtree_deactivate_one,
.activate = nft_rbtree_activate,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e0aa7c1d0224..2ff499680cc6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define XT_PCPU_BLOCK_SIZE 4096
struct compat_delta {
unsigned int offset; /* offset in kernel */
@@ -958,7 +959,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
if (!info) {
- info = vmalloc(sz);
+ info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_HIGHMEM,
+ PAGE_KERNEL);
if (!info)
return NULL;
}
@@ -982,7 +985,7 @@ void xt_free_table_info(struct xt_table_info *info)
}
EXPORT_SYMBOL(xt_free_table_info);
-/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+/* Find table by name, grabs mutex & ref. Returns NULL on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
@@ -1513,7 +1516,7 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
if (!num_hooks)
return ERR_PTR(-EINVAL);
- ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
+ ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL);
if (ops == NULL)
return ERR_PTR(-ENOMEM);
@@ -1615,6 +1618,59 @@ void xt_proto_fini(struct net *net, u_int8_t af)
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+/**
+ * xt_percpu_counter_alloc - allocate x_tables rule counter
+ *
+ * @state: pointer to xt_percpu allocation state
+ * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
+ *
+ * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
+ * contain the address of the real (percpu) counter.
+ *
+ * Rule evaluation needs to use xt_get_this_cpu_counter() helper
+ * to fetch the real percpu counter.
+ *
+ * To speed up allocation and improve data locality, a 4kb block is
+ * allocated.
+ *
+ * xt_percpu_counter_alloc_state contains the base address of the
+ * allocated page and the current sub-offset.
+ *
+ * returns false on error.
+ */
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+ struct xt_counters *counter)
+{
+ BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2));
+
+ if (nr_cpu_ids <= 1)
+ return true;
+
+ if (!state->mem) {
+ state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
+ XT_PCPU_BLOCK_SIZE);
+ if (!state->mem)
+ return false;
+ }
+ counter->pcnt = (__force unsigned long)(state->mem + state->off);
+ state->off += sizeof(*counter);
+ if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) {
+ state->mem = NULL;
+ state->off = 0;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
+
+void xt_percpu_counter_free(struct xt_counters *counters)
+{
+ unsigned long pcnt = counters->pcnt;
+
+ if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
+ free_percpu((void __percpu *)pcnt);
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+
static int __net_init xt_net_init(struct net *net)
{
int i;
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 4973cbddc446..19247a17e511 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -132,9 +132,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
goto errout;
audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
- info->type, par->hooknum, skb->len,
- par->in ? par->in->name : "?",
- par->out ? par->out->name : "?");
+ info->type, xt_hooknum(par), skb->len,
+ xt_in(par) ? xt_inname(par) : "?",
+ xt_out(par) ? xt_outname(par) : "?");
if (skb->mark)
audit_log_format(ab, " mark=%#x", skb->mark);
@@ -144,7 +144,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
- if (par->family == NFPROTO_BRIDGE) {
+ if (xt_family(par) == NFPROTO_BRIDGE) {
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
audit_ip4(ab, skb);
@@ -157,7 +157,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
}
}
- switch (par->family) {
+ switch (xt_family(par)) {
case NFPROTO_IPV4:
audit_ip4(ab, skb);
break;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index e04dc282e3bb..da56c06a443c 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -106,7 +106,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -115,7 +115,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connsecmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 6669e68d589e..95c750358747 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -216,7 +216,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err1;
#endif
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
goto err1;
@@ -260,7 +260,7 @@ out:
err3:
nf_ct_tmpl_free(ct);
err2:
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
err1:
return ret;
}
@@ -341,7 +341,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (help)
module_put(help->helper->me);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
xt_ct_destroy_timeout(ct);
nf_ct_put(info->ct);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 1763ab82bcd7..c3b2017ebe41 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -32,15 +32,15 @@ static unsigned int
log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = loginfo->level;
li.u.log.logflags = loginfo->logflags;
- nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out,
- &li, "%s", loginfo->prefix);
+ nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", loginfo->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index b253e07cb1c5..e45a01255e70 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -33,8 +33,8 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
range->max_addr.ip6[i]);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT)
+ if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT)
new_addr.in6 = ipv6_hdr(skb)->daddr;
else
new_addr.in6 = ipv6_hdr(skb)->saddr;
@@ -51,7 +51,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
newrange.min_proto = range->min_proto;
newrange.max_proto = range->max_proto;
- return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
}
static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void netmap_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static unsigned int
@@ -72,16 +77,16 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
struct nf_nat_range newrange;
- NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT ||
- par->hooknum == NF_INET_LOCAL_IN);
+ NF_CT_ASSERT(xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_POST_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT ||
+ xt_hooknum(par) == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT)
+ if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT)
new_ip = ip_hdr(skb)->daddr & ~netmask;
else
new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -96,7 +101,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
newrange.max_proto = mr->range[0].max;
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
}
static int netmap_tg4_check(const struct xt_tgchk_param *par)
@@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static struct xt_target netmap_tg_reg[] __read_mostly = {
@@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg6_checkentry,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
{
@@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg4_check,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
};
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 018eed7e1ff1..c7f8958cea4a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -25,19 +25,20 @@ static unsigned int
nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
+ li.u.ulog.flags = 0;
if (info->flags & XT_NFLOG_F_COPY_LEN)
li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
- nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
- par->out, &li, info->prefix);
+ nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, info->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 8f1779ff7e30..a360b99a958a 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -43,7 +43,7 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
queue = nfqueue_hash(skb, queue, info->queues_total,
- par->family, jhash_initval);
+ xt_family(par), jhash_initval);
}
return NF_QUEUE_NR(queue);
}
@@ -98,7 +98,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
queue = info->queuenum + cpu % info->queues_total;
} else {
queue = nfqueue_hash(skb, queue, info->queues_total,
- par->family, jhash_initval);
+ xt_family(par), jhash_initval);
}
}
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 515131f9e021..91a373a3f534 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex);
#define RATEEST_HSIZE 16
static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
static unsigned int jhash_rnd __read_mostly;
-static bool rnd_inited __read_mostly;
static unsigned int xt_rateest_hash(const char *name)
{
@@ -64,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
- gen_kill_estimator(&est->bstats, &est->rstats);
+ gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
@@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
- if (unlikely(!rnd_inited)) {
- get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
- rnd_inited = true;
- }
+ net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
est = xt_rateest_lookup(info->name);
if (est) {
@@ -136,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 03f0b370e178..98a4c6d4f1cb 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -31,7 +31,7 @@
static unsigned int
redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum);
+ return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par));
}
static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
- return 0;
+
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
/* FIXME: Take multiple ranges --RR */
@@ -56,13 +62,13 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum);
+ return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
}
static struct xt_target redirect_tg_reg[] __read_mostly = {
@@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.revision = 0,
.table = "nat",
.checkentry = redirect_tg6_checkentry,
+ .destroy = redirect_tg_destroy,
.target = redirect_tg6,
.targetsize = sizeof(struct nf_nat_range),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -85,6 +92,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.table = "nat",
.target = redirect_tg4,
.checkentry = redirect_tg4_check,
+ .destroy = redirect_tg_destroy,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_OUT),
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e118397254af..27241a767f17 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -108,20 +108,16 @@ tcpmss_mangle_packet(struct sk_buff *skb,
return -1;
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
- struct net *net = par->net;
+ struct net *net = xt_net(par);
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
+ unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
- if (dst_mtu(skb_dst(skb)) <= minlen) {
+ if (min_mtu <= minlen) {
net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
- dst_mtu(skb_dst(skb)));
+ min_mtu);
return -1;
}
- if (in_mtu <= minlen) {
- net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
- in_mtu);
- return -1;
- }
- newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
+ newmss = min_mtu - minlen;
} else
newmss = info->mss;
@@ -176,7 +172,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
* length IPv6 header of 60, ergo the default MSS value is 1220
* Since no MSS was provided, we must use the default values
*/
- if (par->family == NFPROTO_IPV4)
+ if (xt_family(par) == NFPROTO_IPV4)
newmss = min(newmss, (u16)536);
else
newmss = min(newmss, (u16)1220);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 6e57a3966dc5..1c57ace75ae6 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -33,7 +33,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
+ nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);
return XT_CONTINUE;
}
@@ -45,7 +45,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
+ nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif);
return XT_CONTINUE;
}
@@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->oif[0]) {
+ int ret;
+
if (info->oif[sizeof(info->oif)-1] != '\0')
return -EINVAL;
@@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
priv->notifier.notifier_call = tee_netdev_event;
info->priv = priv;
- register_netdevice_notifier(&priv->notifier);
+ ret = register_netdevice_notifier(&priv->notifier);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
} else
info->priv = NULL;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 663c4c3c9072..80cb7babeb64 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -364,7 +364,8 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
- return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
+ tgi->mark_mask, tgi->mark_value);
}
static unsigned int
@@ -372,7 +373,8 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
- return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
+ tgi->mark_mask, tgi->mark_value);
}
#ifdef XT_TPROXY_HAVE_IPV6
@@ -442,7 +444,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+ sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr,
tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
hp->source,
@@ -485,10 +487,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
- par->in, NFT_LOOKUP_ESTABLISHED);
+ xt_in(par), NFT_LOOKUP_ESTABLISHED);
laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
lport = tgi->lport ? tgi->lport : hp->dest;
@@ -500,10 +502,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
tproto, &iph->saddr, laddr,
hp->source, lport,
- par->in, NFT_LOOKUP_LISTENER);
+ xt_in(par), NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
if (sk && tproxy_sk_is_transparent(sk)) {
@@ -529,6 +531,11 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
static int tproxy_tg6_check(const struct xt_tgchk_param *par)
{
const struct ip6t_ip6 *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv6_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
!(i->invflags & IP6T_INV_PROTO))
@@ -543,6 +550,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv4_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
&& !(i->invflags & IPT_INV_PROTO))
@@ -594,11 +606,6 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
static int __init tproxy_tg_init(void)
{
- nf_defrag_ipv4_enable();
-#ifdef XT_TPROXY_HAVE_IPV6
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 11d6091991a4..e329dabde35f 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
static bool
addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_addrtype_info *info = par->matchinfo;
const struct iphdr *iph = ip_hdr(skb);
bool ret = true;
@@ -143,19 +143,19 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
static bool
addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_addrtype_info_v1 *info = par->matchinfo;
const struct iphdr *iph;
const struct net_device *dev = NULL;
bool ret = true;
if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
- dev = par->in;
+ dev = xt_in(par);
else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
- dev = par->out;
+ dev = xt_out(par);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
- if (par->family == NFPROTO_IPV6)
+ if (xt_family(par) == NFPROTO_IPV6)
return addrtype_mt6(net, dev, skb, info);
#endif
iph = ip_hdr(skb);
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47ec4..2dedaa23ab0a 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_bpf");
MODULE_ALIAS("ip6t_bpf");
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+ struct bpf_prog **ret)
{
- struct xt_bpf_info *info = par->matchinfo;
struct sock_fprog_kern program;
- program.len = info->bpf_program_num_elem;
- program.filter = info->bpf_program;
+ program.len = len;
+ program.filter = insns;
- if (bpf_prog_create(&info->filter, &program)) {
+ if (bpf_prog_create(ret, &program)) {
pr_info("bpf: check failed: parse error\n");
return -EINVAL;
}
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ *ret = prog;
+ return 0;
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ if (info->mode == XT_BPF_MODE_BYTECODE)
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+ else if (info->mode == XT_BPF_MODE_FD_PINNED ||
+ info->mode == XT_BPF_MODE_FD_ELF)
+ return __bpf_mt_check_fd(info->fd, &info->filter);
+ else
+ return -EINVAL;
+}
+
static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
return BPF_PROG_RUN(info->filter, skb);
}
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
+
+ bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
bpf_prog_destroy(info->filter);
}
-static struct xt_match bpf_mt_reg __read_mostly = {
- .name = "bpf",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = bpf_mt_check,
- .match = bpf_mt,
- .destroy = bpf_mt_destroy,
- .matchsize = sizeof(struct xt_bpf_info),
- .me = THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+ {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "bpf",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check_v1,
+ .match = bpf_mt_v1,
+ .destroy = bpf_mt_destroy_v1,
+ .matchsize = sizeof(struct xt_bpf_info_v1),
+ .me = THIS_MODULE,
+ },
};
static int __init bpf_mt_init(void)
{
- return xt_register_match(&bpf_mt_reg);
+ return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
static void __exit bpf_mt_exit(void)
{
- xt_unregister_match(&bpf_mt_reg);
+ xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
module_init(bpf_mt_init);
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 96fa26b20b67..9a9884a39c0e 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -112,7 +112,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
* know, matches should not alter packets, but we are doing this here
* because we would need to add a PKTTYPE target for this sole purpose.
*/
- if (!xt_cluster_is_multicast_addr(skb, par->family) &&
+ if (!xt_cluster_is_multicast_addr(skb, xt_family(par)) &&
skb->pkt_type == PACKET_MULTICAST) {
pskb->pkt_type = PACKET_HOST;
}
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d4bec261e74e..cad0b7b5eb35 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -110,7 +110,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
sinfo->direction != XT_CONNBYTES_DIR_BOTH)
return -EINVAL;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -129,7 +129,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connbytes_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 03d66f1c5e69..7827128d5a95 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -61,7 +61,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -70,14 +70,14 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
ret = nf_connlabels_get(par->net, info->bit);
if (ret < 0)
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return ret;
}
static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
{
nf_connlabels_put(par->net);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connlabels_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 99bbc829868d..2aff2b7c4689 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -317,7 +317,7 @@ static int count_them(struct net *net,
static bool
connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_connlimit_info *info = par->matchinfo;
union nf_inet_addr addr;
struct nf_conntrack_tuple tuple;
@@ -332,11 +332,11 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
zone = nf_ct_zone(ct);
} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, net, &tuple)) {
+ xt_family(par), net, &tuple)) {
goto hotdrop;
}
- if (par->family == NFPROTO_IPV6) {
+ if (xt_family(par) == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
&iph->daddr : &iph->saddr, sizeof(addr.ip6));
@@ -347,7 +347,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
connections = count_them(net, info->data, tuple_ptr, &addr,
- &info->mask, par->family, zone);
+ &info->mask, xt_family(par), zone);
if (connections == 0)
/* kmalloc failed, drop it entirely */
goto hotdrop;
@@ -366,15 +366,9 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
unsigned int i;
int ret;
- if (unlikely(!connlimit_rnd)) {
- u_int32_t rand;
+ net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&connlimit_rnd, 0, rand);
- }
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for "
"address family %u\n", par->family);
@@ -384,7 +378,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
/* init private data */
info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
if (info->data == NULL) {
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return -ENOMEM;
}
@@ -420,7 +414,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
const struct xt_connlimit_info *info = par->matchinfo;
unsigned int i;
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
destroy_tree(&info->data->climit_root4[i]);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 69f78e96fdb4..9935d5029b0e 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -44,7 +44,7 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
u_int32_t newmark;
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
+ if (ct == NULL || nf_ct_is_untracked(ct))
return XT_CONTINUE;
switch (info->mode) {
@@ -77,7 +77,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -86,7 +86,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static bool
@@ -97,7 +97,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
+ if (ct == NULL || nf_ct_is_untracked(ct))
return false;
return ((ct->mark & info->mask) == info->mark) ^ info->invert;
@@ -107,7 +107,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -116,7 +116,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 188404b9b002..c0fb217bc649 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -200,22 +200,22 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
return false;
if (info->match_flags & XT_CONNTRACK_ORIGSRC)
- if (conntrack_mt_origsrc(ct, info, par->family) ^
+ if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_ORIGSRC))
return false;
if (info->match_flags & XT_CONNTRACK_ORIGDST)
- if (conntrack_mt_origdst(ct, info, par->family) ^
+ if (conntrack_mt_origdst(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_ORIGDST))
return false;
if (info->match_flags & XT_CONNTRACK_REPLSRC)
- if (conntrack_mt_replsrc(ct, info, par->family) ^
+ if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_REPLSRC))
return false;
if (info->match_flags & XT_CONNTRACK_REPLDST)
- if (conntrack_mt_repldst(ct, info, par->family) ^
+ if (conntrack_mt_repldst(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_REPLDST))
return false;
@@ -233,10 +233,8 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
return false;
if (info->match_flags & XT_CONNTRACK_EXPIRES) {
- unsigned long expires = 0;
+ unsigned long expires = nf_ct_expires(ct) / HZ;
- if (timer_pending(&ct->timeout))
- expires = (ct->timeout.expires - jiffies) / HZ;
if ((expires >= info->expires_min &&
expires <= info->expires_max) ^
!(info->invert_flags & XT_CONNTRACK_EXPIRES))
@@ -273,7 +271,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -282,7 +280,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match conntrack_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index d9202cdd25c9..96ebe1cdefec 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -24,12 +24,12 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_devgroup_info *info = par->matchinfo;
if (info->flags & XT_DEVGROUP_MATCH_SRC &&
- (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+ (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
return false;
if (info->flags & XT_DEVGROUP_MATCH_DST &&
- (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+ (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
return false;
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 64670fc5d0e1..236ac8008909 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -58,7 +58,7 @@ static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_tos_match_info *info = par->matchinfo;
- if (par->family == NFPROTO_IPV4)
+ if (xt_family(par) == NFPROTO_IPV4)
return ((ip_hdr(skb)->tos & info->tos_mask) ==
info->tos_value) ^ !!info->invert;
else
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 178696852bde..10063408141d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -49,13 +49,14 @@ struct hashlimit_net {
struct proc_dir_entry *ip6t_hashlimit;
};
-static int hashlimit_net_id;
+static unsigned int hashlimit_net_id;
static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
{
return net_generic(net, hashlimit_net_id);
}
/* need to declare this at the top */
+static const struct file_operations dl_file_ops_v1;
static const struct file_operations dl_file_ops;
/* hash table crap */
@@ -86,8 +87,8 @@ struct dsthash_ent {
unsigned long expires; /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
- u_int32_t credit;
- u_int32_t credit_cap, cost;
+ u_int64_t credit;
+ u_int64_t credit_cap, cost;
} rateinfo;
struct rcu_head rcu;
};
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
u_int8_t family;
bool rnd_initialized;
- struct hashlimit_cfg1 cfg; /* config */
+ struct hashlimit_cfg2 cfg; /* config */
/* used internally */
spinlock_t lock; /* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
struct hlist_head hash[0]; /* hashtable itself */
};
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+ if (revision == 1) {
+ struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+ to->mode = cfg->mode;
+ to->avg = cfg->avg;
+ to->burst = cfg->burst;
+ to->size = cfg->size;
+ to->max = cfg->max;
+ to->gc_interval = cfg->gc_interval;
+ to->expire = cfg->expire;
+ to->srcmask = cfg->srcmask;
+ to->dstmask = cfg->dstmask;
+ } else if (revision == 2) {
+ memcpy(to, from, sizeof(struct hashlimit_cfg2));
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */
static struct kmem_cache *hashlimit_cachep __read_mostly;
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
}
static void htable_gc(struct work_struct *work);
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
- u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+ const char *name, u_int8_t family,
+ struct xt_hashlimit_htable **out_hinfo,
+ int revision)
{
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
- unsigned int size;
- unsigned int i;
+ unsigned int size, i;
+ int ret;
- if (minfo->cfg.size) {
- size = minfo->cfg.size;
+ if (cfg->size) {
+ size = cfg->size;
} else {
size = (totalram_pages << PAGE_SHIFT) / 16384 /
sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
sizeof(struct list_head) * size);
if (hinfo == NULL)
return -ENOMEM;
- minfo->hinfo = hinfo;
+ *out_hinfo = hinfo;
/* copy match config into hashtable config */
- memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+
+ if (ret)
+ return ret;
+
hinfo->cfg.size = size;
if (hinfo->cfg.max == 0)
hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
- hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+ hinfo->name = kstrdup(name, GFP_KERNEL);
if (!hinfo->name) {
vfree(hinfo);
return -ENOMEM;
}
spin_lock_init(&hinfo->lock);
- hinfo->pde = proc_create_data(minfo->name, 0,
+ hinfo->pde = proc_create_data(name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
- &dl_file_ops, hinfo);
+ (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
+ hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
vfree(hinfo);
@@ -398,7 +430,8 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
(slowest userspace tool allows), which means
CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
*/
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ (0xFFFFFFFFFFFFFFFFULL / (HZ*60*60*24))
/* Repeated shift and or gives us all 1s, final shift and add 1 gives
* us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,9 +441,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
/* in byte mode, the lowest possible rate is one packet/second.
* credit_cap is used as a counter that tells us how many times we can
@@ -425,14 +461,25 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
}
/* Precision saver. */
-static u32 user2credits(u32 user)
+static u64 user2credits(u64 user, int revision)
{
- /* If multiplying would overflow... */
- if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
- /* Divide first. */
- return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+ if (revision == 1) {
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
+ /* Divide first. */
+ return div64_u64(user, XT_HASHLIMIT_SCALE)
+ * HZ * CREDITS_PER_JIFFY_v1;
+
+ return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
+ XT_HASHLIMIT_SCALE);
+ } else {
+ if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY))
+ return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
+ * HZ * CREDITS_PER_JIFFY;
- return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+ return div64_u64(user * HZ * CREDITS_PER_JIFFY,
+ XT_HASHLIMIT_SCALE_v2);
+ }
}
static u32 user2credits_byte(u32 user)
@@ -442,10 +489,11 @@ static u32 user2credits_byte(u32 user)
return (u32) (us >> 32);
}
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+ u32 mode, int revision)
{
unsigned long delta = now - dh->rateinfo.prev;
- u32 cap;
+ u64 cap, cpj;
if (delta == 0)
return;
@@ -453,7 +501,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
dh->rateinfo.prev = now;
if (mode & XT_HASHLIMIT_BYTES) {
- u32 tmp = dh->rateinfo.credit;
+ u64 tmp = dh->rateinfo.credit;
dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
cap = CREDITS_PER_JIFFY_BYTES * HZ;
if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -461,7 +509,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
return;
}
} else {
- dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+ cpj = (revision == 1) ?
+ CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+ dh->rateinfo.credit += delta * cpj;
cap = dh->rateinfo.credit_cap;
}
if (dh->rateinfo.credit > cap)
@@ -469,7 +519,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
}
static void rateinfo_init(struct dsthash_ent *dh,
- struct xt_hashlimit_htable *hinfo)
+ struct xt_hashlimit_htable *hinfo, int revision)
{
dh->rateinfo.prev = jiffies;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -478,8 +528,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.credit_cap = hinfo->cfg.burst;
} else {
dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ hinfo->cfg.burst, revision);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
dh->rateinfo.credit_cap = dh->rateinfo.credit;
}
}
@@ -603,15 +653,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
}
static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
+ struct xt_hashlimit_htable *hinfo,
+ const struct hashlimit_cfg2 *cfg, int revision)
{
- const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
- struct xt_hashlimit_htable *hinfo = info->hinfo;
unsigned long now = jiffies;
struct dsthash_ent *dh;
struct dsthash_dst dst;
bool race = false;
- u32 cost;
+ u64 cost;
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
@@ -626,18 +676,18 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
} else if (race) {
/* Already got an entry, update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
} else {
dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_init(dh, hinfo);
+ rateinfo_init(dh, hinfo, revision);
}
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
}
- if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+ if (cfg->mode & XT_HASHLIMIT_BYTES)
cost = hashlimit_byte_cost(skb->len, dh);
else
cost = dh->rateinfo.cost;
@@ -647,84 +697,157 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
dh->rateinfo.credit -= cost;
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
- return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+ return !(cfg->mode & XT_HASHLIMIT_INVERT);
}
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
/* default match is underlimit - so over the limit, we need to invert */
- return info->cfg.mode & XT_HASHLIMIT_INVERT;
+ return cfg->mode & XT_HASHLIMIT_INVERT;
hotdrop:
par->hotdrop = true;
return false;
}
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static bool
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ struct xt_hashlimit_htable *hinfo = info->hinfo;
+ struct hashlimit_cfg2 cfg = {};
+ int ret;
+
+ ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+ if (ret)
+ return ret;
+
+ return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+ struct xt_hashlimit_htable *hinfo = info->hinfo;
+
+ return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+}
+
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
+ struct xt_hashlimit_htable **hinfo,
+ struct hashlimit_cfg2 *cfg,
+ const char *name, int revision)
{
struct net *net = par->net;
- struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
int ret;
- if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
- return -EINVAL;
- if (info->name[sizeof(info->name)-1] != '\0')
+ if (cfg->gc_interval == 0 || cfg->expire == 0)
return -EINVAL;
if (par->family == NFPROTO_IPV4) {
- if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+ if (cfg->srcmask > 32 || cfg->dstmask > 32)
return -EINVAL;
} else {
- if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+ if (cfg->srcmask > 128 || cfg->dstmask > 128)
return -EINVAL;
}
- if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+ if (cfg->mode & ~XT_HASHLIMIT_ALL) {
pr_info("Unknown mode mask %X, kernel too old?\n",
- info->cfg.mode);
+ cfg->mode);
return -EINVAL;
}
/* Check for overflow. */
- if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
- if (user2credits_byte(info->cfg.avg) == 0) {
- pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+ if (cfg->mode & XT_HASHLIMIT_BYTES) {
+ if (user2credits_byte(cfg->avg) == 0) {
+ pr_info("overflow, rate too high: %llu\n", cfg->avg);
return -EINVAL;
}
- } else if (info->cfg.burst == 0 ||
- user2credits(info->cfg.avg * info->cfg.burst) <
- user2credits(info->cfg.avg)) {
- pr_info("overflow, try lower: %u/%u\n",
- info->cfg.avg, info->cfg.burst);
+ } else if (cfg->burst == 0 ||
+ user2credits(cfg->avg * cfg->burst, revision) <
+ user2credits(cfg->avg, revision)) {
+ pr_info("overflow, try lower: %llu/%llu\n",
+ cfg->avg, cfg->burst);
return -ERANGE;
}
mutex_lock(&hashlimit_mutex);
- info->hinfo = htable_find_get(net, info->name, par->family);
- if (info->hinfo == NULL) {
- ret = htable_create(net, info, par->family);
+ *hinfo = htable_find_get(net, name, par->family);
+ if (*hinfo == NULL) {
+ ret = htable_create(net, cfg, name, par->family,
+ hinfo, revision);
if (ret < 0) {
mutex_unlock(&hashlimit_mutex);
return ret;
}
}
mutex_unlock(&hashlimit_mutex);
+
return 0;
}
-static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ struct hashlimit_cfg2 cfg = {};
+ int ret;
+
+ if (info->name[sizeof(info->name) - 1] != '\0')
+ return -EINVAL;
+
+ ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+ if (ret)
+ return ret;
+
+ return hashlimit_mt_check_common(par, &info->hinfo,
+ &cfg, info->name, 1);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+ if (info->name[sizeof(info->name) - 1] != '\0')
+ return -EINVAL;
+
+ return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
+ info->name, 2);
+}
+
+static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
{
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
htable_put(info->hinfo);
}
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+ htable_put(info->hinfo);
+}
+
static struct xt_match hashlimit_mt_reg[] __read_mostly = {
{
.name = "hashlimit",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = hashlimit_mt,
+ .match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check_v1,
+ .destroy = hashlimit_mt_destroy_v1,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "hashlimit",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
@@ -734,8 +857,18 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.name = "hashlimit",
.revision = 1,
.family = NFPROTO_IPV6,
- .match = hashlimit_mt,
+ .match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check_v1,
+ .destroy = hashlimit_mt_destroy_v1,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "hashlimit",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
@@ -786,18 +919,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
spin_unlock_bh(&htable->lock);
}
-static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
- struct seq_file *s)
+static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
{
- const struct xt_hashlimit_htable *ht = s->private;
-
- spin_lock(&ent->lock);
- /* recalculate to show accurate numbers */
- rateinfo_recalc(ent, jiffies, ht->cfg.mode);
-
switch (family) {
case NFPROTO_IPV4:
- seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+ seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip.src,
ntohs(ent->dst.src_port),
@@ -808,7 +935,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
break;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
case NFPROTO_IPV6:
- seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+ seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip6.src,
ntohs(ent->dst.src_port),
@@ -821,10 +948,52 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
default:
BUG();
}
+}
+
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
+{
+ const struct xt_hashlimit_htable *ht = s->private;
+
+ spin_lock(&ent->lock);
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
+
+ dl_seq_print(ent, family, s);
+
+ spin_unlock(&ent->lock);
+ return seq_has_overflowed(s);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
+{
+ const struct xt_hashlimit_htable *ht = s->private;
+
+ spin_lock(&ent->lock);
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+ dl_seq_print(ent, family, s);
+
spin_unlock(&ent->lock);
return seq_has_overflowed(s);
}
+static int dl_seq_show_v1(struct seq_file *s, void *v)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket = (unsigned int *)v;
+ struct dsthash_ent *ent;
+
+ if (!hlist_empty(&htable->hash[*bucket])) {
+ hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+ if (dl_seq_real_show_v1(ent, htable->family, s))
+ return -1;
+ }
+ return 0;
+}
+
static int dl_seq_show(struct seq_file *s, void *v)
{
struct xt_hashlimit_htable *htable = s->private;
@@ -839,6 +1008,13 @@ static int dl_seq_show(struct seq_file *s, void *v)
return 0;
}
+static const struct seq_operations dl_seq_ops_v1 = {
+ .start = dl_seq_start,
+ .next = dl_seq_next,
+ .stop = dl_seq_stop,
+ .show = dl_seq_show_v1
+};
+
static const struct seq_operations dl_seq_ops = {
.start = dl_seq_start,
.next = dl_seq_next,
@@ -846,17 +1022,37 @@ static const struct seq_operations dl_seq_ops = {
.show = dl_seq_show
};
+static int dl_proc_open_v1(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &dl_seq_ops_v1);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ sf->private = PDE_DATA(inode);
+ }
+ return ret;
+}
+
static int dl_proc_open(struct inode *inode, struct file *file)
{
int ret = seq_open(file, &dl_seq_ops);
if (!ret) {
struct seq_file *sf = file->private_data;
+
sf->private = PDE_DATA(inode);
}
return ret;
}
+static const struct file_operations dl_file_ops_v1 = {
+ .owner = THIS_MODULE,
+ .open = dl_proc_open_v1,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
static const struct file_operations dl_file_ops = {
.owner = THIS_MODULE,
.open = dl_proc_open,
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 9f4ab00c8050..38a78151c0e9 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (!master_help)
return ret;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(master_help->helper);
if (!helper)
return ret;
@@ -59,19 +59,19 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
struct xt_helper_info *info = par->matchinfo;
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
return ret;
}
- info->name[29] = '\0';
+ info->name[sizeof(info->name) - 1] = '\0';
return 0;
}
static void helper_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match helper_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
index 89d53104c6b3..000e70377f85 100644
--- a/net/netfilter/xt_ipcomp.c
+++ b/net/netfilter/xt_ipcomp.c
@@ -26,6 +26,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fan Du <fan.du@windriver.com>");
MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match");
+MODULE_ALIAS("ipt_ipcomp");
+MODULE_ALIAS("ip6t_ipcomp");
/* Returns 1 if the spi is matched by the range, 0 otherwise */
static inline bool
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 71a9d95e0a81..0fdc89064488 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -48,9 +48,9 @@ static bool
ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_ipvs_mtinfo *data = par->matchinfo;
- struct netns_ipvs *ipvs = net_ipvs(par->net);
+ struct netns_ipvs *ipvs = net_ipvs(xt_net(par));
/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
- const u_int8_t family = par->family;
+ const u_int8_t family = xt_family(par);
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index ac1d3c3d09e7..1cde0e4985b7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -42,29 +42,43 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
e = minfo->ports[++i];
pr_debug("src or dst matches with %d-%d?\n", s, e);
- if (minfo->flags == XT_MULTIPORT_SOURCE
- && src >= s && src <= e)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_DESTINATION
- && dst >= s && dst <= e)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_EITHER
- && ((dst >= s && dst <= e)
- || (src >= s && src <= e)))
- return true ^ minfo->invert;
+ switch (minfo->flags) {
+ case XT_MULTIPORT_SOURCE:
+ if (src >= s && src <= e)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_DESTINATION:
+ if (dst >= s && dst <= e)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_EITHER:
+ if ((dst >= s && dst <= e) ||
+ (src >= s && src <= e))
+ return true ^ minfo->invert;
+ break;
+ default:
+ break;
+ }
} else {
/* exact port matching */
pr_debug("src or dst matches with %d?\n", s);
- if (minfo->flags == XT_MULTIPORT_SOURCE
- && src == s)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_DESTINATION
- && dst == s)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_EITHER
- && (src == s || dst == s))
- return true ^ minfo->invert;
+ switch (minfo->flags) {
+ case XT_MULTIPORT_SOURCE:
+ if (src == s)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_DESTINATION:
+ if (dst == s)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_EITHER:
+ if (src == s || dst == s)
+ return true ^ minfo->invert;
+ break;
+ default:
+ break;
+ }
}
}
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bea7464cc43f..8107b3eb865f 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,7 +23,17 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
par->target->name);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static int xt_nat_checkentry(const struct xt_tgchk_param *par)
+{
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void xt_nat_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static void xt_nat_convert_range(struct nf_nat_range *dst,
@@ -106,6 +116,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "SNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -118,6 +129,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "DNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -129,6 +141,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "SNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
@@ -139,6 +153,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "DNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index cf327593852a..cc0518fe598e 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
nfnl_acct_update(skb, info->nfacct);
- overquota = nfnl_acct_overquota(par->net, skb, info->nfacct);
+ overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct);
return overquota == NFACCT_UNDERQUOTA ? false : true;
}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 2455b69b5810..c05fefcec238 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,7 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
- struct net *net = p->net;
+ struct net *net = xt_net(p);
if (!info)
return false;
@@ -326,8 +326,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
fcount++;
if (info->flags & XT_OSF_LOG)
- nf_log_packet(net, p->family, p->hooknum, skb,
- p->in, p->out, NULL,
+ nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
+ xt_in(p), xt_out(p), NULL,
"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
f->genre, f->version, f->subtype,
&ip->saddr, ntohs(tcp->source),
@@ -341,8 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
rcu_read_unlock();
if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(net, p->family, p->hooknum, skb, p->in,
- p->out, NULL,
+ nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
+ xt_out(p), NULL,
"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
&ip->saddr, ntohs(tcp->source),
&ip->daddr, ntohs(tcp->dest));
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index a20e731b5b6c..16477df45b3b 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -63,7 +63,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
struct sock *sk = skb_to_full_sk(skb);
- struct net *net = par->net;
+ struct net *net = xt_net(par);
if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index e5f18988aee0..bb33598e4530 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -107,8 +107,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
- pr_info("using --physdev-out and --physdev-is-out are only"
- "supported in the FORWARD and POSTROUTING chains with"
+ pr_info("using --physdev-out and --physdev-is-out are only "
+ "supported in the FORWARD and POSTROUTING chains with "
"bridged traffic.\n");
if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
return -EINVAL;
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 5b645cb598fc..57efb703ff18 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -30,10 +30,10 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (skb->pkt_type != PACKET_LOOPBACK)
type = skb->pkt_type;
- else if (par->family == NFPROTO_IPV4 &&
+ else if (xt_family(par) == NFPROTO_IPV4 &&
ipv4_is_multicast(ip_hdr(skb)->daddr))
type = PACKET_MULTICAST;
- else if (par->family == NFPROTO_IPV6 &&
+ else if (xt_family(par) == NFPROTO_IPV6 &&
ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
type = PACKET_MULTICAST;
else
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f23e97bb42d7..2b4ab189bba7 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -116,9 +116,9 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
int ret;
if (info->flags & XT_POLICY_MATCH_IN)
- ret = match_policy_in(skb, info, par->family);
+ ret = match_policy_in(skb, info, xt_family(par));
else
- ret = match_policy_out(skb, info, par->family);
+ ret = match_policy_out(skb, info, xt_family(par));
if (ret < 0)
ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a..1db02f6fca54 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est64 *r;
+ struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
- spin_lock_bh(&info->est1->lock);
- r = &info->est1->rstats;
+ gen_estimator_read(&info->est1->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
- pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+ pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
- bps1 = r->bps;
- pps1 = r->pps;
+ bps1 = sample.bps;
+ pps1 = sample.pps;
}
- spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
- spin_lock_bh(&info->est2->lock);
- r = &info->est2->rstats;
+ gen_estimator_read(&info->est2->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
- pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+ pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
- bps2 = r->bps;
- pps2 = r->pps;
+ bps2 = sample.bps;
+ pps2 = sample.pps;
}
- spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d725a27743a1..1d89a4eaf841 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -95,7 +95,7 @@ struct recent_net {
#endif
};
-static int recent_net_id __read_mostly;
+static unsigned int recent_net_id __read_mostly;
static inline struct recent_net *recent_pernet(struct net *net)
{
@@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops;
#endif
static u_int32_t hash_rnd __read_mostly;
-static bool hash_rnd_inited __read_mostly;
static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
{
@@ -237,7 +236,7 @@ static void recent_table_flush(struct recent_table *t)
static bool
recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct recent_net *recent_net = recent_pernet(net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
@@ -246,7 +245,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
u_int8_t ttl;
bool ret = info->invert;
- if (par->family == NFPROTO_IPV4) {
+ if (xt_family(par) == NFPROTO_IPV4) {
const struct iphdr *iph = ip_hdr(skb);
if (info->side == XT_RECENT_DEST)
@@ -267,7 +266,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
/* use TTL as seen before forwarding */
- if (par->out != NULL && skb->sk == NULL)
+ if (xt_out(par) != NULL && skb->sk == NULL)
ttl++;
spin_lock_bh(&recent_lock);
@@ -275,12 +274,12 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
- e = recent_entry_lookup(t, &addr_mask, par->family,
+ e = recent_entry_lookup(t, &addr_mask, xt_family(par),
(info->check_set & XT_RECENT_TTL) ? ttl : 0);
if (e == NULL) {
if (!(info->check_set & XT_RECENT_SET))
goto out;
- e = recent_entry_init(t, &addr_mask, par->family, ttl);
+ e = recent_entry_init(t, &addr_mask, xt_family(par), ttl);
if (e == NULL)
par->hotdrop = true;
ret = !ret;
@@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
int ret = -EINVAL;
size_t sz;
- if (unlikely(!hash_rnd_inited)) {
- get_random_bytes(&hash_rnd, sizeof(hash_rnd));
- hash_rnd_inited = true;
- }
+ net_get_random_once(&hash_rnd, sizeof(hash_rnd));
+
if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
pr_info("Unsupported user space flags (%08x)\n",
info->check_set);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index ef36a56a02c6..4dedb96d1a06 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb,
++i, offset, sch->type, htons(sch->length),
sch->flags);
#endif
- offset += WORD_ROUND(ntohs(sch->length));
+ offset += SCTP_PAD4(ntohs(sch->length));
pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 5669e5b453f4..64285702afd5 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -55,7 +55,7 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v0 *info = par->matchinfo;
- ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim,
info->match_set.u.compat.flags, 0, UINT_MAX);
return match_set(info->match_set.index, skb, par, &opt,
@@ -118,7 +118,7 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v1 *info = par->matchinfo;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, 0, UINT_MAX);
if (opt.flags & IPSET_RETURN_NOMATCH)
@@ -184,7 +184,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_set_info_match_v3 *info = par->matchinfo;
int ret;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -231,7 +231,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_set_info_match_v4 *info = par->matchinfo;
int ret;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -259,9 +259,9 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v0 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim,
info->add_set.u.compat.flags, 0, UINT_MAX);
- ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim,
info->del_set.u.compat.flags, 0, UINT_MAX);
if (info->add_set.index != IPSET_INVALID_ID)
@@ -332,9 +332,9 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v1 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, 0, UINT_MAX);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
if (info->add_set.index != IPSET_INVALID_ID)
@@ -401,9 +401,9 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v2 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
/* Normalize to fit into jiffies */
@@ -423,17 +423,19 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
/* Revision 3 target */
+#define MOPT(opt, member) ((opt).ext.skbinfo.member)
+
static unsigned int
set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
int ret;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
- ADT_OPT(map_opt, par->family, info->map_set.dim,
+ ADT_OPT(map_opt, xt_family(par), info->map_set.dim,
info->map_set.flags, 0, UINT_MAX);
/* Normalize to fit into jiffies */
@@ -453,14 +455,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
if (!ret)
return XT_CONTINUE;
if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK)
- skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask))
- ^ (map_opt.ext.skbmark);
+ skb->mark = (skb->mark & ~MOPT(map_opt,skbmarkmask))
+ ^ MOPT(map_opt, skbmark);
if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO)
- skb->priority = map_opt.ext.skbprio;
+ skb->priority = MOPT(map_opt, skbprio);
if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) &&
skb->dev &&
- skb->dev->real_num_tx_queues > map_opt.ext.skbqueue)
- skb_set_queue_mapping(skb, map_opt.ext.skbqueue);
+ skb->dev->real_num_tx_queues > MOPT(map_opt, skbqueue))
+ skb_set_queue_mapping(skb, MOPT(map_opt, skbqueue));
}
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index b10ade272b50..770bbec878f1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,76 +22,14 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-#define XT_SOCKET_HAVE_IPV6 1
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/inet6_hashtables.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
+#include <net/netfilter/nf_socket.h>
#include <linux/netfilter/xt_socket.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-#define XT_SOCKET_HAVE_CONNTRACK 1
-#include <net/netfilter/nf_conntrack.h>
-#endif
-
-static int
-extract_icmp4_fields(const struct sk_buff *skb,
- u8 *protocol,
- __be32 *raddr,
- __be32 *laddr,
- __be16 *rport,
- __be16 *lport)
-{
- unsigned int outside_hdrlen = ip_hdrlen(skb);
- struct iphdr *inside_iph, _inside_iph;
- struct icmphdr *icmph, _icmph;
- __be16 *ports, _ports[2];
-
- icmph = skb_header_pointer(skb, outside_hdrlen,
- sizeof(_icmph), &_icmph);
- if (icmph == NULL)
- return 1;
-
- switch (icmph->type) {
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_REDIRECT:
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- break;
- default:
- return 1;
- }
-
- inside_iph = skb_header_pointer(skb, outside_hdrlen +
- sizeof(struct icmphdr),
- sizeof(_inside_iph), &_inside_iph);
- if (inside_iph == NULL)
- return 1;
-
- if (inside_iph->protocol != IPPROTO_TCP &&
- inside_iph->protocol != IPPROTO_UDP)
- return 1;
-
- ports = skb_header_pointer(skb, outside_hdrlen +
- sizeof(struct icmphdr) +
- (inside_iph->ihl << 2),
- sizeof(_ports), &_ports);
- if (ports == NULL)
- return 1;
-
- /* the inside IP packet is the one quoted from our side, thus
- * its saddr is the local address */
- *protocol = inside_iph->protocol;
- *laddr = inside_iph->saddr;
- *lport = ports[0];
- *raddr = inside_iph->daddr;
- *rport = ports[1];
-
- return 0;
-}
-
/* "socket" match based redirection (no specific rule)
* ===================================================
*
@@ -111,104 +49,6 @@ extract_icmp4_fields(const struct sk_buff *skb,
* then local services could intercept traffic going through the
* box.
*/
-static struct sock *
-xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
- const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- return inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
- case IPPROTO_UDP:
- return udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- }
- return NULL;
-}
-
-static bool xt_socket_sk_is_transparent(struct sock *sk)
-{
- switch (sk->sk_state) {
- case TCP_TIME_WAIT:
- return inet_twsk(sk)->tw_transparent;
-
- case TCP_NEW_SYN_RECV:
- return inet_rsk(inet_reqsk(sk))->no_srccheck;
-
- default:
- return inet_sk(sk)->transparent;
- }
-}
-
-static struct sock *xt_socket_lookup_slow_v4(struct net *net,
- const struct sk_buff *skb,
- const struct net_device *indev)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct sk_buff *data_skb = NULL;
- int doff = 0;
- __be32 uninitialized_var(daddr), uninitialized_var(saddr);
- __be16 uninitialized_var(dport), uninitialized_var(sport);
- u8 uninitialized_var(protocol);
-#ifdef XT_SOCKET_HAVE_CONNTRACK
- struct nf_conn const *ct;
- enum ip_conntrack_info ctinfo;
-#endif
-
- if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
- struct udphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, ip_hdrlen(skb),
- sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return NULL;
-
- protocol = iph->protocol;
- saddr = iph->saddr;
- sport = hp->source;
- daddr = iph->daddr;
- dport = hp->dest;
- data_skb = (struct sk_buff *)skb;
- doff = iph->protocol == IPPROTO_TCP ?
- ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
- ip_hdrlen(skb) + sizeof(*hp);
-
- } else if (iph->protocol == IPPROTO_ICMP) {
- if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
- &sport, &dport))
- return NULL;
- } else {
- return NULL;
- }
-
-#ifdef XT_SOCKET_HAVE_CONNTRACK
- /* Do the lookup with the original socket address in
- * case this is a reply packet of an established
- * SNAT-ted connection.
- */
- ct = nf_ct_get(skb, &ctinfo);
- if (ct && !nf_ct_is_untracked(ct) &&
- ((iph->protocol != IPPROTO_ICMP &&
- ctinfo == IP_CT_ESTABLISHED_REPLY) ||
- (iph->protocol == IPPROTO_ICMP &&
- ctinfo == IP_CT_RELATED_REPLY)) &&
- (ct->status & IPS_SRC_NAT_DONE)) {
-
- daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
- dport = (iph->protocol == IPPROTO_TCP) ?
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
- }
-#endif
-
- return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
- daddr, sport, dport, indev);
-}
-
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
@@ -217,7 +57,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v4(par->net, skb, par->in);
+ sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
if (sk) {
bool wildcard;
bool transparent = true;
@@ -233,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = xt_socket_sk_is_transparent(sk);
+ transparent = nf_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent)
@@ -265,132 +105,7 @@ socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
return socket_match(skb, par, par->matchinfo);
}
-#ifdef XT_SOCKET_HAVE_IPV6
-
-static int
-extract_icmp6_fields(const struct sk_buff *skb,
- unsigned int outside_hdrlen,
- int *protocol,
- const struct in6_addr **raddr,
- const struct in6_addr **laddr,
- __be16 *rport,
- __be16 *lport,
- struct ipv6hdr *ipv6_var)
-{
- const struct ipv6hdr *inside_iph;
- struct icmp6hdr *icmph, _icmph;
- __be16 *ports, _ports[2];
- u8 inside_nexthdr;
- __be16 inside_fragoff;
- int inside_hdrlen;
-
- icmph = skb_header_pointer(skb, outside_hdrlen,
- sizeof(_icmph), &_icmph);
- if (icmph == NULL)
- return 1;
-
- if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
- return 1;
-
- inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
- sizeof(*ipv6_var), ipv6_var);
- if (inside_iph == NULL)
- return 1;
- inside_nexthdr = inside_iph->nexthdr;
-
- inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
- sizeof(*ipv6_var),
- &inside_nexthdr, &inside_fragoff);
- if (inside_hdrlen < 0)
- return 1; /* hjm: Packet has no/incomplete transport layer headers. */
-
- if (inside_nexthdr != IPPROTO_TCP &&
- inside_nexthdr != IPPROTO_UDP)
- return 1;
-
- ports = skb_header_pointer(skb, inside_hdrlen,
- sizeof(_ports), &_ports);
- if (ports == NULL)
- return 1;
-
- /* the inside IP packet is the one quoted from our side, thus
- * its saddr is the local address */
- *protocol = inside_nexthdr;
- *laddr = &inside_iph->saddr;
- *lport = ports[0];
- *raddr = &inside_iph->daddr;
- *rport = ports[1];
-
- return 0;
-}
-
-static struct sock *
-xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
- const u8 protocol,
- const struct in6_addr *saddr, const struct in6_addr *daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
- case IPPROTO_UDP:
- return udp6_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- }
-
- return NULL;
-}
-
-static struct sock *xt_socket_lookup_slow_v6(struct net *net,
- const struct sk_buff *skb,
- const struct net_device *indev)
-{
- __be16 uninitialized_var(dport), uninitialized_var(sport);
- const struct in6_addr *daddr = NULL, *saddr = NULL;
- struct ipv6hdr *iph = ipv6_hdr(skb);
- struct sk_buff *data_skb = NULL;
- int doff = 0;
- int thoff = 0, tproto;
-
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
- return NULL;
- }
-
- if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
- struct udphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return NULL;
-
- saddr = &iph->saddr;
- sport = hp->source;
- daddr = &iph->daddr;
- dport = hp->dest;
- data_skb = (struct sk_buff *)skb;
- doff = tproto == IPPROTO_TCP ?
- thoff + __tcp_hdrlen((struct tcphdr *)hp) :
- thoff + sizeof(*hp);
-
- } else if (tproto == IPPROTO_ICMPV6) {
- struct ipv6hdr ipv6_var;
-
- if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
- &sport, &dport, &ipv6_var))
- return NULL;
- } else {
- return NULL;
- }
-
- return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
- sport, dport, indev);
-}
-
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static bool
socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -399,7 +114,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v6(par->net, skb, par->in);
+ sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
if (sk) {
bool wildcard;
bool transparent = true;
@@ -415,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = xt_socket_sk_is_transparent(sk);
+ transparent = nf_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent)
@@ -432,9 +147,28 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
}
#endif
+static int socket_mt_enable_defrag(struct net *net, int family)
+{
+ switch (family) {
+ case NFPROTO_IPV4:
+ return nf_defrag_ipv4_enable(net);
+#ifdef XT_SOCKET_HAVE_IPV6
+ case NFPROTO_IPV6:
+ return nf_defrag_ipv6_enable(net);
+#endif
+ }
+ WARN_ONCE(1, "Unknown family %d\n", family);
+ return 0;
+}
+
static int socket_mt_v1_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V1) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
@@ -446,6 +180,11 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
static int socket_mt_v2_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V2) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
@@ -458,7 +197,11 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo3 *info =
(struct xt_socket_mtinfo3 *)par->matchinfo;
+ int err;
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V3) {
pr_info("unknown flags 0x%x\n",
info->flags & ~XT_SOCKET_FLAGS_V3);
@@ -488,7 +231,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 1,
@@ -512,7 +255,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 2,
@@ -536,7 +279,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 3,
@@ -553,11 +296,6 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
static int __init socket_mt_init(void)
{
- nf_defrag_ipv4_enable();
-#ifdef XT_SOCKET_HAVE_IPV6
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
}
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index a507922d80cd..5746a33789a5 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -43,7 +43,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -52,7 +52,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
static void state_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match state_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 0ae55a36f492..1b01eec1fbda 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -168,7 +168,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* may happen that the same packet matches both rules if
* it arrived at the right moment before 13:00.
*/
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp((struct sk_buff *)skb);
stamp = ktime_to_ns(skb->tstamp);
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 2ec93c5e77bb..d177dd066504 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CALIPSO family */
-static struct genl_family netlbl_calipso_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_CALIPSO_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_CALIPSO_A_MAX,
-};
+static struct genl_family netlbl_calipso_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
@@ -355,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = {
},
};
+static struct genl_family netlbl_calipso_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CALIPSO_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CALIPSO_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_calipso_ops,
+ .n_ops = ARRAY_SIZE(netlbl_calipso_ops),
+};
+
/* NetLabel Generic NETLINK Protocol Functions
*/
@@ -368,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = {
*/
int __init netlbl_calipso_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
- netlbl_calipso_ops);
+ return genl_register_family(&netlbl_calipso_gnl_family);
}
static const struct netlbl_calipso_ops *calipso_ops;
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 7fd1104ba900..4149d3e63589 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -59,14 +59,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_cipsov4_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_CIPSOV4_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_CIPSOV4_A_MAX,
-};
-
+static struct genl_family netlbl_cipsov4_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
[NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
@@ -767,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
},
};
+static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CIPSOV4_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CIPSOV4_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_cipsov4_ops,
+ .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -781,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
*/
int __init netlbl_cipsov4_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
- netlbl_cipsov4_ops);
+ return genl_register_family(&netlbl_cipsov4_gnl_family);
}
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28c56b95fb7f..ea7c67050792 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1502,10 +1502,7 @@ static int __init netlbl_init(void)
printk(KERN_INFO "NetLabel: Initializing\n");
printk(KERN_INFO "NetLabel: domain hash size = %u\n",
(1 << NETLBL_DOMHSH_BITSIZE));
- printk(KERN_INFO "NetLabel: protocols ="
- " UNLABELED"
- " CIPSOv4"
- "\n");
+ printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n");
ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
if (ret_val != 0)
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index f85d0e07af2d..21e0095b1d14 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_mgmt_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_MGMT_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_MGMT_A_MAX,
-};
+static struct genl_family netlbl_mgmt_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
@@ -834,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
},
};
+static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_MGMT_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_MGMT_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_mgmt_genl_ops,
+ .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -848,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
*/
int __init netlbl_mgmt_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
- netlbl_mgmt_genl_ops);
+ return genl_register_family(&netlbl_mgmt_gnl_family);
}
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 4528cff9138b..22dc1b9d6362 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -123,13 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
static u8 netlabel_unlabel_acceptflg;
/* NetLabel Generic NETLINK unlabeled family */
-static struct genl_family netlbl_unlabel_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_UNLABELED_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_UNLABEL_A_MAX,
-};
+static struct genl_family netlbl_unlabel_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
@@ -1378,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
},
};
+static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_UNLABELED_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_UNLABEL_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_unlabel_genl_ops,
+ .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -1392,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
*/
int __init netlbl_unlabel_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
- netlbl_unlabel_genl_ops);
+ return genl_register_family(&netlbl_unlabel_gnl_family);
}
/*
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 627f898c05b9..161b628ab2b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -40,7 +40,7 @@
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
@@ -113,7 +113,7 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
-static ATOMIC_NOTIFIER_HEAD(netlink_chain);
+static BLOCKING_NOTIFIER_HEAD(netlink_chain);
static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;
@@ -329,7 +329,6 @@ static void netlink_sock_destruct(struct sock *sk)
if (nlk->cb_running) {
if (nlk->cb.done)
nlk->cb.done(&nlk->cb);
-
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -346,6 +345,14 @@ static void netlink_sock_destruct(struct sock *sk)
WARN_ON(nlk_sk(sk)->groups);
}
+static void netlink_sock_destruct_work(struct work_struct *work)
+{
+ struct netlink_sock *nlk = container_of(work, struct netlink_sock,
+ work);
+
+ sk_free(&nlk->sk);
+}
+
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
* SMP. Look, when several writers sleep and reader wakes them up, all but one
* immediately hit write lock and grab all the cpus. Exclusive sleep solves
@@ -648,8 +655,18 @@ out_module:
static void deferred_put_nlk_sk(struct rcu_head *head)
{
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+ struct sock *sk = &nlk->sk;
+
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (nlk->cb_running && nlk->cb.done) {
+ INIT_WORK(&nlk->work, netlink_sock_destruct_work);
+ schedule_work(&nlk->work);
+ return;
+ }
- sock_put(&nlk->sk);
+ sk_free(sk);
}
static int netlink_release(struct socket *sock)
@@ -694,7 +711,7 @@ static int netlink_release(struct socket *sock)
.protocol = sk->sk_protocol,
.portid = nlk->portid,
};
- atomic_notifier_call_chain(&netlink_chain,
+ blocking_notifier_call_chain(&netlink_chain,
NETLINK_URELEASE, &n);
}
@@ -1832,7 +1849,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
/* Record the max length of recvmsg() calls for future allocations */
nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
- 16384);
+ SKB_WITH_OVERHEAD(32768));
copied = data_skb->len;
if (len < copied) {
@@ -2083,8 +2100,9 @@ static int netlink_dump(struct sock *sk)
if (alloc_min_size < nlk->max_recvmsg_len) {
alloc_size = nlk->max_recvmsg_len;
- skb = alloc_skb(alloc_size, GFP_KERNEL |
- __GFP_NOWARN | __GFP_NORETRY);
+ skb = alloc_skb(alloc_size,
+ (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NOWARN | __GFP_NORETRY);
}
if (!skb) {
alloc_size = alloc_min_size;
@@ -2486,13 +2504,13 @@ static const struct file_operations netlink_seq_fops = {
int netlink_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&netlink_chain, nb);
+ return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);
int netlink_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&netlink_chain, nb);
+ return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 3cfd6cc60504..4fdb38318977 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -3,6 +3,7 @@
#include <linux/rhashtable.h>
#include <linux/atomic.h>
+#include <linux/workqueue.h>
#include <net/sock.h>
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -33,6 +34,7 @@ struct netlink_sock {
struct rhash_head node;
struct rcu_head rcu;
+ struct work_struct work;
};
static inline struct netlink_sock *nlk_sk(struct sock *sk)
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8dd836a8dd60..a5546249fb10 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -63,43 +63,74 @@ out_nlmsg_trim:
static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
int protocol, int s_num)
{
+ struct rhashtable_iter *hti = (void *)cb->args[2];
struct netlink_table *tbl = &nl_table[protocol];
- struct rhashtable *ht = &tbl->hash;
- const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
struct net *net = sock_net(skb->sk);
struct netlink_diag_req *req;
struct netlink_sock *nlsk;
struct sock *sk;
- int ret = 0, num = 0, i;
+ int num = 2;
+ int ret = 0;
req = nlmsg_data(cb->nlh);
- for (i = 0; i < htbl->size; i++) {
- struct rhash_head *pos;
+ if (s_num > 1)
+ goto mc_list;
- rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
- sk = (struct sock *)nlsk;
+ num--;
- if (!net_eq(sock_net(sk), net))
- continue;
- if (num < s_num) {
- num++;
+ if (!hti) {
+ hti = kmalloc(sizeof(*hti), GFP_KERNEL);
+ if (!hti)
+ return -ENOMEM;
+
+ cb->args[2] = (long)hti;
+ }
+
+ if (!s_num)
+ rhashtable_walk_enter(&tbl->hash, hti);
+
+ ret = rhashtable_walk_start(hti);
+ if (ret == -EAGAIN)
+ ret = 0;
+ if (ret)
+ goto stop;
+
+ while ((nlsk = rhashtable_walk_next(hti))) {
+ if (IS_ERR(nlsk)) {
+ ret = PTR_ERR(nlsk);
+ if (ret == -EAGAIN) {
+ ret = 0;
continue;
}
+ break;
+ }
- if (sk_diag_fill(sk, skb, req,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI,
- sock_i_ino(sk)) < 0) {
- ret = 1;
- goto done;
- }
+ sk = (struct sock *)nlsk;
- num++;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (sk_diag_fill(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ ret = 1;
+ break;
}
}
+stop:
+ rhashtable_walk_stop(hti);
+ if (ret)
+ goto done;
+
+ rhashtable_walk_exit(hti);
+ num++;
+
+mc_list:
+ read_lock(&nl_table_lock);
sk_for_each_bound(sk, &tbl->mc_list) {
if (sk_hashed(sk))
continue;
@@ -116,13 +147,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
NLM_F_MULTI,
sock_i_ino(sk)) < 0) {
ret = 1;
- goto done;
+ break;
}
num++;
}
+ read_unlock(&nl_table_lock);
+
done:
cb->args[0] = num;
- cb->args[1] = protocol;
return ret;
}
@@ -131,34 +163,40 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_diag_req *req;
int s_num = cb->args[0];
+ int err = 0;
req = nlmsg_data(cb->nlh);
- rcu_read_lock();
- read_lock(&nl_table_lock);
-
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
for (i = cb->args[1]; i < MAX_LINKS; i++) {
- if (__netlink_diag_dump(skb, cb, i, s_num))
+ err = __netlink_diag_dump(skb, cb, i, s_num);
+ if (err)
break;
s_num = 0;
}
+ cb->args[1] = i;
} else {
- if (req->sdiag_protocol >= MAX_LINKS) {
- read_unlock(&nl_table_lock);
- rcu_read_unlock();
+ if (req->sdiag_protocol >= MAX_LINKS)
return -ENOENT;
- }
- __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
+ err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
}
- read_unlock(&nl_table_lock);
- rcu_read_unlock();
+ return err < 0 ? err : skb->len;
+}
+
+static int netlink_diag_dump_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *hti = (void *)cb->args[2];
+
+ if (cb->args[0] == 1)
+ rhashtable_walk_exit(hti);
- return skb->len;
+ kfree(hti);
+
+ return 0;
}
static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
@@ -172,6 +210,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = netlink_diag_dump,
+ .done = netlink_diag_dump_done,
};
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
} else
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index a09132a69869..fb6e10fdb217 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -17,6 +17,7 @@
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/rwsem.h>
+#include <linux/idr.h>
#include <net/sock.h>
#include <net/genetlink.h>
@@ -58,10 +59,8 @@ static void genl_unlock_all(void)
up_write(&cb_lock);
}
-#define GENL_FAM_TAB_SIZE 16
-#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
+static DEFINE_IDR(genl_fam_idr);
-static struct list_head family_ht[GENL_FAM_TAB_SIZE];
/*
* Bitmap of multicast groups that are currently in use.
*
@@ -86,45 +85,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
static unsigned long *mc_groups = &mc_group_start;
static unsigned long mc_groups_longs = 1;
-static int genl_ctrl_event(int event, struct genl_family *family,
+static int genl_ctrl_event(int event, const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id);
-static inline unsigned int genl_family_hash(unsigned int id)
+static const struct genl_family *genl_family_find_byid(unsigned int id)
{
- return id & GENL_FAM_TAB_MASK;
+ return idr_find(&genl_fam_idr, id);
}
-static inline struct list_head *genl_family_chain(unsigned int id)
+static const struct genl_family *genl_family_find_byname(char *name)
{
- return &family_ht[genl_family_hash(id)];
-}
-
-static struct genl_family *genl_family_find_byid(unsigned int id)
-{
- struct genl_family *f;
-
- list_for_each_entry(f, genl_family_chain(id), family_list)
- if (f->id == id)
- return f;
-
- return NULL;
-}
-
-static struct genl_family *genl_family_find_byname(char *name)
-{
- struct genl_family *f;
- int i;
+ const struct genl_family *family;
+ unsigned int id;
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
- list_for_each_entry(f, genl_family_chain(i), family_list)
- if (strcmp(f->name, name) == 0)
- return f;
+ idr_for_each_entry(&genl_fam_idr, family, id)
+ if (strcmp(family->name, name) == 0)
+ return family;
return NULL;
}
-static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
+static const struct genl_ops *genl_get_cmd(u8 cmd,
+ const struct genl_family *family)
{
int i;
@@ -135,26 +118,6 @@ static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
return NULL;
}
-/* Of course we are going to have problems once we hit
- * 2^16 alive types, but that can only happen by year 2K
-*/
-static u16 genl_generate_id(void)
-{
- static u16 id_gen_idx = GENL_MIN_ID;
- int i;
-
- for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
- if (id_gen_idx != GENL_ID_VFS_DQUOT &&
- id_gen_idx != GENL_ID_PMCRAID &&
- !genl_family_find_byid(id_gen_idx))
- return id_gen_idx;
- if (++id_gen_idx > GENL_MAX_ID)
- id_gen_idx = GENL_MIN_ID;
- }
-
- return 0;
-}
-
static int genl_allocate_reserve_groups(int n_groups, int *first_id)
{
unsigned long *new_groups;
@@ -295,7 +258,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
return err;
}
-static void genl_unregister_mc_groups(struct genl_family *family)
+static void genl_unregister_mc_groups(const struct genl_family *family)
{
struct net *net;
int i;
@@ -344,28 +307,21 @@ static int genl_validate_ops(const struct genl_family *family)
}
/**
- * __genl_register_family - register a generic netlink family
+ * genl_register_family - register a generic netlink family
* @family: generic netlink family
*
* Registers the specified family after validating it first. Only one
* family may be registered with the same family name or identifier.
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
*
- * The family's ops array must already be assigned, you can use the
- * genl_register_family_with_ops() helper function.
+ * The family's ops, multicast groups and module pointer must already
+ * be assigned.
*
* Return 0 on success or a negative error code.
*/
-int __genl_register_family(struct genl_family *family)
+int genl_register_family(struct genl_family *family)
{
- int err = -EINVAL, i;
-
- if (family->id && family->id < GENL_MIN_ID)
- goto errout;
-
- if (family->id > GENL_MAX_ID)
- goto errout;
+ int err, i;
+ int start = GENL_START_ALLOC, end = GENL_MAX_ID;
err = genl_validate_ops(family);
if (err)
@@ -378,18 +334,20 @@ int __genl_register_family(struct genl_family *family)
goto errout_locked;
}
- if (family->id == GENL_ID_GENERATE) {
- u16 newid = genl_generate_id();
-
- if (!newid) {
- err = -ENOMEM;
- goto errout_locked;
- }
-
- family->id = newid;
- } else if (genl_family_find_byid(family->id)) {
- err = -EEXIST;
- goto errout_locked;
+ /*
+ * Sadly, a few cases need to be special-cased
+ * due to them having previously abused the API
+ * and having used their family ID also as their
+ * multicast group ID, so we use reserved IDs
+ * for both to be sure we can do that mapping.
+ */
+ if (family == &genl_ctrl) {
+ /* and this needs to be special for initial family lookups */
+ start = end = GENL_ID_CTRL;
+ } else if (strcmp(family->name, "pmcraid") == 0) {
+ start = end = GENL_ID_PMCRAID;
+ } else if (strcmp(family->name, "VFS_DQUOT") == 0) {
+ start = end = GENL_ID_VFS_DQUOT;
}
if (family->maxattr && !family->parallel_ops) {
@@ -402,11 +360,17 @@ int __genl_register_family(struct genl_family *family)
} else
family->attrbuf = NULL;
+ family->id = idr_alloc(&genl_fam_idr, family,
+ start, end + 1, GFP_KERNEL);
+ if (family->id < 0) {
+ err = family->id;
+ goto errout_locked;
+ }
+
err = genl_validate_assign_mc_groups(family);
if (err)
- goto errout_locked;
+ goto errout_remove;
- list_add_tail(&family->family_list, genl_family_chain(family->id));
genl_unlock_all();
/* send all events */
@@ -417,12 +381,14 @@ int __genl_register_family(struct genl_family *family)
return 0;
+errout_remove:
+ idr_remove(&genl_fam_idr, family->id);
+ kfree(family->attrbuf);
errout_locked:
genl_unlock_all();
-errout:
return err;
}
-EXPORT_SYMBOL(__genl_register_family);
+EXPORT_SYMBOL(genl_register_family);
/**
* genl_unregister_family - unregister generic netlink family
@@ -432,33 +398,29 @@ EXPORT_SYMBOL(__genl_register_family);
*
* Returns 0 on success or a negative error code.
*/
-int genl_unregister_family(struct genl_family *family)
+int genl_unregister_family(const struct genl_family *family)
{
- struct genl_family *rc;
-
genl_lock_all();
- list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
- if (family->id != rc->id || strcmp(rc->name, family->name))
- continue;
+ if (!genl_family_find_byid(family->id)) {
+ genl_unlock_all();
+ return -ENOENT;
+ }
- genl_unregister_mc_groups(family);
+ genl_unregister_mc_groups(family);
- list_del(&rc->family_list);
- family->n_ops = 0;
- up_write(&cb_lock);
- wait_event(genl_sk_destructing_waitq,
- atomic_read(&genl_sk_destructing_cnt) == 0);
- genl_unlock();
+ idr_remove(&genl_fam_idr, family->id);
- kfree(family->attrbuf);
- genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
- return 0;
- }
+ up_write(&cb_lock);
+ wait_event(genl_sk_destructing_waitq,
+ atomic_read(&genl_sk_destructing_cnt) == 0);
+ genl_unlock();
- genl_unlock_all();
+ kfree(family->attrbuf);
+
+ genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
- return -ENOENT;
+ return 0;
}
EXPORT_SYMBOL(genl_unregister_family);
@@ -474,7 +436,7 @@ EXPORT_SYMBOL(genl_unregister_family);
* Returns pointer to user specific header
*/
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
- struct genl_family *family, int flags, u8 cmd)
+ const struct genl_family *family, int flags, u8 cmd)
{
struct nlmsghdr *nlh;
struct genlmsghdr *hdr;
@@ -533,7 +495,7 @@ static int genl_lock_done(struct netlink_callback *cb)
return rc;
}
-static int genl_family_rcv_msg(struct genl_family *family,
+static int genl_family_rcv_msg(const struct genl_family *family,
struct sk_buff *skb,
struct nlmsghdr *nlh)
{
@@ -645,7 +607,7 @@ out:
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct genl_family *family;
+ const struct genl_family *family;
int err;
family = genl_family_find_byid(nlh->nlmsg_type);
@@ -674,15 +636,9 @@ static void genl_rcv(struct sk_buff *skb)
* Controller
**************************************************************************/
-static struct genl_family genl_ctrl = {
- .id = GENL_ID_CTRL,
- .name = "nlctrl",
- .version = 0x2,
- .maxattr = CTRL_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family genl_ctrl;
-static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
+static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
u32 flags, struct sk_buff *skb, u8 cmd)
{
void *hdr;
@@ -769,7 +725,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ctrl_fill_mcgrp_info(struct genl_family *family,
+static int ctrl_fill_mcgrp_info(const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id, u32 portid, u32 seq, u32 flags,
struct sk_buff *skb, u8 cmd)
@@ -812,37 +768,30 @@ nla_put_failure:
static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
{
-
- int i, n = 0;
+ int n = 0;
struct genl_family *rt;
struct net *net = sock_net(skb->sk);
- int chains_to_skip = cb->args[0];
- int fams_to_skip = cb->args[1];
-
- for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) {
- n = 0;
- list_for_each_entry(rt, genl_family_chain(i), family_list) {
- if (!rt->netnsok && !net_eq(net, &init_net))
- continue;
- if (++n < fams_to_skip)
- continue;
- if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- skb, CTRL_CMD_NEWFAMILY) < 0)
- goto errout;
- }
+ int fams_to_skip = cb->args[0];
+ unsigned int id;
- fams_to_skip = 0;
- }
+ idr_for_each_entry(&genl_fam_idr, rt, id) {
+ if (!rt->netnsok && !net_eq(net, &init_net))
+ continue;
+
+ if (n++ < fams_to_skip)
+ continue;
-errout:
- cb->args[0] = i;
- cb->args[1] = n;
+ if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, CTRL_CMD_NEWFAMILY) < 0)
+ break;
+ }
+ cb->args[0] = n;
return skb->len;
}
-static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
+static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family,
u32 portid, int seq, u8 cmd)
{
struct sk_buff *skb;
@@ -862,7 +811,7 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
}
static struct sk_buff *
-ctrl_build_mcgrp_msg(struct genl_family *family,
+ctrl_build_mcgrp_msg(const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id, u32 portid, int seq, u8 cmd)
{
@@ -892,7 +841,7 @@ static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *msg;
- struct genl_family *res = NULL;
+ const struct genl_family *res = NULL;
int err = -EINVAL;
if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
@@ -936,7 +885,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
return genlmsg_reply(msg, info);
}
-static int genl_ctrl_event(int event, struct genl_family *family,
+static int genl_ctrl_event(int event, const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id)
{
@@ -977,7 +926,7 @@ static int genl_ctrl_event(int event, struct genl_family *family,
return 0;
}
-static struct genl_ops genl_ctrl_ops[] = {
+static const struct genl_ops genl_ctrl_ops[] = {
{
.cmd = CTRL_CMD_GETFAMILY,
.doit = ctrl_getfamily,
@@ -986,31 +935,43 @@ static struct genl_ops genl_ctrl_ops[] = {
},
};
-static struct genl_multicast_group genl_ctrl_groups[] = {
+static const struct genl_multicast_group genl_ctrl_groups[] = {
{ .name = "notify", },
};
+static struct genl_family genl_ctrl __ro_after_init = {
+ .module = THIS_MODULE,
+ .ops = genl_ctrl_ops,
+ .n_ops = ARRAY_SIZE(genl_ctrl_ops),
+ .mcgrps = genl_ctrl_groups,
+ .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups),
+ .id = GENL_ID_CTRL,
+ .name = "nlctrl",
+ .version = 0x2,
+ .maxattr = CTRL_ATTR_MAX,
+ .netnsok = true,
+};
+
static int genl_bind(struct net *net, int group)
{
- int i, err = -ENOENT;
+ struct genl_family *f;
+ int err = -ENOENT;
+ unsigned int id;
down_read(&cb_lock);
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
- struct genl_family *f;
-
- list_for_each_entry(f, genl_family_chain(i), family_list) {
- if (group >= f->mcgrp_offset &&
- group < f->mcgrp_offset + f->n_mcgrps) {
- int fam_grp = group - f->mcgrp_offset;
-
- if (!f->netnsok && net != &init_net)
- err = -ENOENT;
- else if (f->mcast_bind)
- err = f->mcast_bind(net, fam_grp);
- else
- err = 0;
- break;
- }
+
+ idr_for_each_entry(&genl_fam_idr, f, id) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
+
+ if (!f->netnsok && net != &init_net)
+ err = -ENOENT;
+ else if (f->mcast_bind)
+ err = f->mcast_bind(net, fam_grp);
+ else
+ err = 0;
+ break;
}
}
up_read(&cb_lock);
@@ -1020,21 +981,19 @@ static int genl_bind(struct net *net, int group)
static void genl_unbind(struct net *net, int group)
{
- int i;
+ struct genl_family *f;
+ unsigned int id;
down_read(&cb_lock);
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
- struct genl_family *f;
- list_for_each_entry(f, genl_family_chain(i), family_list) {
- if (group >= f->mcgrp_offset &&
- group < f->mcgrp_offset + f->n_mcgrps) {
- int fam_grp = group - f->mcgrp_offset;
+ idr_for_each_entry(&genl_fam_idr, f, id) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
- if (f->mcast_unbind)
- f->mcast_unbind(net, fam_grp);
- break;
- }
+ if (f->mcast_unbind)
+ f->mcast_unbind(net, fam_grp);
+ break;
}
}
up_read(&cb_lock);
@@ -1074,13 +1033,9 @@ static struct pernet_operations genl_pernet_ops = {
static int __init genl_init(void)
{
- int i, err;
-
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
- INIT_LIST_HEAD(&family_ht[i]);
+ int err;
- err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops,
- genl_ctrl_groups);
+ err = genl_register_family(&genl_ctrl);
if (err < 0)
goto problem;
@@ -1096,6 +1051,25 @@ problem:
subsys_initcall(genl_init);
+/**
+ * genl_family_attrbuf - return family's attrbuf
+ * @family: the family
+ *
+ * Return the family's attrbuf, while validating that it's
+ * actually valid to access it.
+ *
+ * You cannot use this function with a family that has parallel_ops
+ * and you can only use it within (pre/post) doit/dumpit callbacks.
+ */
+struct nlattr **genl_family_attrbuf(const struct genl_family *family)
+{
+ if (!WARN_ON(family->parallel_ops))
+ lockdep_assert_held(&genl_mutex);
+
+ return family->attrbuf;
+}
+EXPORT_SYMBOL(genl_family_attrbuf);
+
static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
gfp_t flags)
{
@@ -1125,8 +1099,9 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
return err;
}
-int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
- u32 portid, unsigned int group, gfp_t flags)
+int genlmsg_multicast_allns(const struct genl_family *family,
+ struct sk_buff *skb, u32 portid,
+ unsigned int group, gfp_t flags)
{
if (WARN_ON_ONCE(group >= family->n_mcgrps))
return -EINVAL;
@@ -1135,7 +1110,7 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
}
EXPORT_SYMBOL(genlmsg_multicast_allns);
-void genl_notify(struct genl_family *family, struct sk_buff *skb,
+void genl_notify(const struct genl_family *family, struct sk_buff *skb,
struct genl_info *info, u32 group, gfp_t flags)
{
struct net *net = genl_info_net(info);
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index ea023b35f1c2..03f3d5c7beb8 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -38,14 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = {
{ .name = NFC_GENL_MCAST_EVENT_NAME, },
};
-static struct genl_family nfc_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NFC_GENL_NAME,
- .version = NFC_GENL_VERSION,
- .maxattr = NFC_ATTR_MAX,
-};
-
+static struct genl_family nfc_genl_family;
static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
[NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
[NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
@@ -120,21 +113,20 @@ nla_put_failure:
static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family);
struct nfc_dev *dev;
int rc;
u32 idx;
rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize,
- nfc_genl_family.attrbuf,
- nfc_genl_family.maxattr,
- nfc_genl_policy);
+ attrbuf, nfc_genl_family.maxattr, nfc_genl_policy);
if (rc < 0)
return ERR_PTR(rc);
- if (!nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX])
+ if (!attrbuf[NFC_ATTR_DEVICE_INDEX])
return ERR_PTR(-EINVAL);
- idx = nla_get_u32(nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]);
+ idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]);
dev = nfc_get_device(idx);
if (!dev)
@@ -1754,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = {
},
};
+static struct genl_family nfc_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NFC_GENL_NAME,
+ .version = NFC_GENL_VERSION,
+ .maxattr = NFC_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = nfc_genl_ops,
+ .n_ops = ARRAY_SIZE(nfc_genl_ops),
+ .mcgrps = nfc_genl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps),
+};
+
struct urelease_work {
struct work_struct w;
@@ -1839,9 +1843,7 @@ int __init nfc_genl_init(void)
{
int rc;
- rc = genl_register_family_with_ops_groups(&nfc_genl_family,
- nfc_genl_ops,
- nfc_genl_mcgrps);
+ rc = genl_register_family(&nfc_genl_family);
if (rc)
return rc;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1ecbd7715f6d..514f7bcf7c63 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -62,15 +62,19 @@ struct ovs_frag_data {
struct vport *vport;
struct ovs_skb_cb cb;
__be16 inner_protocol;
- __u16 vlan_tci;
+ u16 network_offset; /* valid only for MPLS */
+ u16 vlan_tci;
__be16 vlan_proto;
unsigned int l2_len;
+ u8 mac_proto;
u8 l2_data[MAX_L2_LEN];
};
static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
#define DEFERRED_ACTION_FIFO_SIZE 10
+#define OVS_RECURSION_LIMIT 5
+#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
struct action_fifo {
int head;
int tail;
@@ -78,7 +82,12 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
};
+struct recirc_keys {
+ struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+};
+
static struct action_fifo __percpu *action_fifos;
+static struct recirc_keys __percpu *recirc_keys;
static DEFINE_PER_CPU(int, exec_actions_level);
static void action_fifo_init(struct action_fifo *fifo)
@@ -129,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
static void invalidate_flow_key(struct sw_flow_key *key)
{
- key->eth.type = htons(0);
+ key->mac_proto |= SW_FLOW_KEY_INVALID;
}
static bool is_flow_key_valid(const struct sw_flow_key *key)
{
- return !!key->eth.type;
+ return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}
static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
@@ -153,7 +162,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
- __be32 *new_mpls_lse;
+ struct mpls_shim_hdr *new_mpls_lse;
/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
@@ -162,19 +171,24 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
if (skb_cow_head(skb, MPLS_HLEN) < 0)
return -ENOMEM;
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb->mac_len);
+ skb_set_inner_protocol(skb, skb->protocol);
+ }
+
skb_push(skb, MPLS_HLEN);
memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
skb->mac_len);
skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
- new_mpls_lse = (__be32 *)skb_mpls_header(skb);
- *new_mpls_lse = mpls->mpls_lse;
+ new_mpls_lse = mpls_hdr(skb);
+ new_mpls_lse->label_stack_entry = mpls->mpls_lse;
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
- if (!skb->inner_protocol)
- skb_set_inner_protocol(skb, skb->protocol);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
skb->protocol = mpls->mpls_ethertype;
invalidate_flow_key(key);
@@ -184,26 +198,30 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const __be16 ethertype)
{
- struct ethhdr *hdr;
int err;
err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
if (unlikely(err))
return err;
- skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN);
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
skb->mac_len);
__skb_pull(skb, MPLS_HLEN);
skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
- /* skb_mpls_header() is used to locate the ethertype
- * field correctly in the presence of VLAN tags.
- */
- hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
- update_ethertype(skb, hdr, ethertype);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
+ struct ethhdr *hdr;
+
+ /* mpls_hdr() is used to locate the ethertype field correctly in the
+ * presence of VLAN tags.
+ */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ update_ethertype(skb, hdr, ethertype);
+ }
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
@@ -214,7 +232,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
const __be32 *mpls_lse, const __be32 *mask)
{
- __be32 *stack;
+ struct mpls_shim_hdr *stack;
__be32 lse;
int err;
@@ -222,16 +240,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
if (unlikely(err))
return err;
- stack = (__be32 *)skb_mpls_header(skb);
- lse = OVS_MASKED(*stack, *mpls_lse, *mask);
+ stack = mpls_hdr(skb);
+ lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __be32 diff[] = { ~(*stack), lse };
+ __be32 diff[] = { ~(stack->label_stack_entry), lse };
skb->csum = ~csum_partial((char *)diff, sizeof(diff),
~skb->csum);
}
- *stack = lse;
+ stack->label_stack_entry = lse;
flow_key->mpls.top_lse = lse;
return 0;
}
@@ -241,20 +259,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
int err;
err = skb_vlan_pop(skb);
- if (skb_vlan_tag_present(skb))
+ if (skb_vlan_tag_present(skb)) {
invalidate_flow_key(key);
- else
- key->eth.tci = 0;
+ } else {
+ key->eth.vlan.tci = 0;
+ key->eth.vlan.tpid = 0;
+ }
return err;
}
static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_vlan *vlan)
{
- if (skb_vlan_tag_present(skb))
+ if (skb_vlan_tag_present(skb)) {
invalidate_flow_key(key);
- else
- key->eth.tci = vlan->vlan_tci;
+ } else {
+ key->eth.vlan.tci = vlan->vlan_tci;
+ key->eth.vlan.tpid = vlan->vlan_tpid;
+ }
return skb_vlan_push(skb, vlan->vlan_tpid,
ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}
@@ -295,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
+/* pop_eth does not support VLAN packets as this action is never called
+ * for them.
+ */
+static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_NONE;
+ invalidate_flow_key(key);
+ return 0;
+}
+
+static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_action_push_eth *ethh)
+{
+ struct ethhdr *hdr;
+
+ /* Add the new Ethernet header */
+ if (skb_cow_head(skb, ETH_HLEN) < 0)
+ return -ENOMEM;
+
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ hdr = eth_hdr(skb);
+ ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
+ ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
+ hdr->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, hdr, ETH_HLEN);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_ETHERNET;
+ invalidate_flow_key(key);
+ return 0;
+}
+
static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
__be32 addr, __be32 new_addr)
{
@@ -650,7 +713,13 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
skb_postpush_rcsum(skb, skb->data, data->l2_len);
skb_reset_mac_header(skb);
- ovs_vport_send(vport, skb);
+ if (eth_p_mpls(skb->protocol)) {
+ skb->inner_network_header = skb->network_header;
+ skb_set_network_header(skb, data->network_offset);
+ skb_reset_mac_len(skb);
+ }
+
+ ovs_vport_send(vport, skb, data->mac_proto);
return 0;
}
@@ -668,7 +737,8 @@ static struct dst_ops ovs_dst_ops = {
/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
* ovs_vport_output(), which is called once per fragmented packet.
*/
-static void prepare_frag(struct vport *vport, struct sk_buff *skb)
+static void prepare_frag(struct vport *vport, struct sk_buff *skb,
+ u16 orig_network_offset, u8 mac_proto)
{
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
@@ -678,8 +748,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
data->vport = vport;
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
+ data->network_offset = orig_network_offset;
data->vlan_tci = skb->vlan_tci;
data->vlan_proto = skb->vlan_proto;
+ data->mac_proto = mac_proto;
data->l2_len = hlen;
memcpy(&data->l2_data, skb->data, hlen);
@@ -688,18 +760,27 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
}
static void ovs_fragment(struct net *net, struct vport *vport,
- struct sk_buff *skb, u16 mru, __be16 ethertype)
+ struct sk_buff *skb, u16 mru,
+ struct sw_flow_key *key)
{
+ u16 orig_network_offset = 0;
+
+ if (eth_p_mpls(skb->protocol)) {
+ orig_network_offset = skb_network_offset(skb);
+ skb->network_header = skb->inner_network_header;
+ }
+
if (skb_network_offset(skb) > MAX_L2_LEN) {
OVS_NLERR(1, "L2 header too long to fragment");
goto err;
}
- if (ethertype == htons(ETH_P_IP)) {
+ if (key->eth.type == htons(ETH_P_IP)) {
struct dst_entry ovs_dst;
unsigned long orig_dst;
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_dst.dev = vport->dev;
@@ -710,7 +791,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
- } else if (ethertype == htons(ETH_P_IPV6)) {
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
@@ -719,7 +800,8 @@ static void ovs_fragment(struct net *net, struct vport *vport,
goto err;
}
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -733,7 +815,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
- ovs_vport_name(vport), ntohs(ethertype), mru,
+ ovs_vport_name(vport), ntohs(key->eth.type), mru,
vport->dev->mtu);
goto err;
}
@@ -753,26 +835,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
u32 cutlen = OVS_CB(skb)->cutlen;
if (unlikely(cutlen > 0)) {
- if (skb->len - cutlen > ETH_HLEN)
+ if (skb->len - cutlen > ovs_mac_header_len(key))
pskb_trim(skb, skb->len - cutlen);
else
- pskb_trim(skb, ETH_HLEN);
+ pskb_trim(skb, ovs_mac_header_len(key));
}
- if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
- ovs_vport_send(vport, skb);
+ if (likely(!mru ||
+ (skb->len <= mru + vport->dev->hard_header_len))) {
+ ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
} else if (mru <= vport->dev->mtu) {
struct net *net = read_pnet(&dp->net);
- __be16 ethertype = key->eth.type;
- if (!is_flow_key_valid(key)) {
- if (eth_p_mpls(skb->protocol))
- ethertype = skb->inner_protocol;
- else
- ethertype = vlan_get_protocol(skb);
- }
-
- ovs_fragment(net, vport, skb, mru, ethertype);
+ ovs_fragment(net, vport, skb, mru, key);
} else {
kfree_skb(skb);
}
@@ -1011,6 +1086,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *a, int rem)
{
struct deferred_action *da;
+ int level;
if (!is_flow_key_valid(key)) {
int err;
@@ -1034,6 +1110,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
return 0;
}
+ level = this_cpu_read(exec_actions_level);
+ if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+ struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
+ struct sw_flow_key *recirc_key = &rks->key[level - 1];
+
+ *recirc_key = *key;
+ recirc_key->recirc_id = nla_get_u32(a);
+ ovs_dp_process_packet(skb, recirc_key);
+
+ return 0;
+ }
+
da = add_deferred_actions(skb, key, NULL);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
@@ -1153,6 +1241,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
if (err)
return err == -EINPROGRESS ? 0 : err;
break;
+
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ err = push_eth(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ err = pop_eth(skb, key);
+ break;
}
if (unlikely(err)) {
@@ -1200,11 +1296,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
- static const int ovs_recursion_limit = 5;
int err, level;
level = __this_cpu_inc_return(exec_actions_level);
- if (unlikely(level > ovs_recursion_limit)) {
+ if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
ovs_dp_name(dp));
kfree_skb(skb);
@@ -1229,10 +1324,17 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
+ recirc_keys = alloc_percpu(struct recirc_keys);
+ if (!recirc_keys) {
+ free_percpu(action_fifos);
+ return -ENOMEM;
+ }
+
return 0;
}
void action_fifos_exit(void)
{
free_percpu(action_fifos);
+ free_percpu(recirc_keys);
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e054a748ff25..54253ea5976e 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -370,8 +370,11 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
skb_orphan(skb);
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
- if (err)
+ if (err) {
+ if (err != -EINPROGRESS)
+ kfree_skb(skb);
return err;
+ }
key->ip.proto = ipv6_hdr(skb)->nexthdr;
ovs_cb.mru = IP6CB(skb)->frag_max_size;
@@ -511,7 +514,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
int hooknum, nh_off, err = NF_ACCEPT;
nh_off = skb_network_offset(skb);
- skb_pull(skb, nh_off);
+ skb_pull_rcsum(skb, nh_off);
/* See HOOK2MANIP(). */
if (maniptype == NF_NAT_MANIP_SRC)
@@ -576,6 +579,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
push:
skb_push(skb, nh_off);
+ skb_postpush_rcsum(skb, skb->data, nh_off);
return err;
}
@@ -725,12 +729,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
skb->nfctinfo = IP_CT_NEW;
}
- /* Repeat if requested, see nf_iterate(). */
- do {
- err = nf_conntrack_in(net, info->family,
- NF_INET_PRE_ROUTING, skb);
- } while (err == NF_REPEAT);
-
+ err = nf_conntrack_in(net, info->family,
+ NF_INET_PRE_ROUTING, skb);
if (err != NF_ACCEPT)
return -ENOENT;
@@ -887,7 +887,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
- skb_pull(skb, nh_ofs);
+ skb_pull_rcsum(skb, nh_ofs);
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
@@ -901,6 +901,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_lookup(net, key, info, skb);
skb_push(skb, nh_ofs);
+ skb_postpush_rcsum(skb, skb->data, nh_ofs);
if (err)
kfree_skb(skb);
return err;
@@ -1367,7 +1368,7 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
if (ct_info->helper)
module_put(ct_info->helper->me);
if (ct_info->ct)
- nf_ct_put(ct_info->ct);
+ nf_ct_tmpl_free(ct_info->ct);
}
void ovs_ct_init(struct net *net)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 524c0fd3078e..9c62b6325f7a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -58,8 +58,7 @@
#include "vport-internal_dev.h"
#include "vport-netdev.h"
-int ovs_net_id __read_mostly;
-EXPORT_SYMBOL_GPL(ovs_net_id);
+unsigned int ovs_net_id __read_mostly;
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
@@ -131,7 +130,6 @@ int lockdep_ovsl_is_held(void)
else
return 1;
}
-EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif
static struct vport *new_vport(const struct vport_parms *);
@@ -562,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
- struct ethhdr *eth;
struct vport *input_vport;
u16 mru = 0;
int len;
@@ -583,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
- skb_reset_mac_header(packet);
- eth = eth_hdr(packet);
-
- /* Normally, setting the skb 'protocol' field would be handled by a
- * call to eth_type_trans(), but it assumes there's a sending
- * device, which we may not have. */
- if (eth_proto_is_802_3(eth->h_proto))
- packet->protocol = eth->h_proto;
- else
- packet->protocol = htons(ETH_P_802_2);
-
/* Set packet's mru */
if (a[OVS_PACKET_ATTR_MRU]) {
mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
@@ -672,8 +658,7 @@ static const struct genl_ops dp_packet_genl_ops[] = {
}
};
-static struct genl_family dp_packet_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_packet_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
@@ -682,6 +667,7 @@ static struct genl_family dp_packet_genl_family = {
.parallel_ops = true,
.ops = dp_packet_genl_ops,
.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .module = THIS_MODULE,
};
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
@@ -928,7 +914,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct sw_flow_key key;
struct sw_flow_actions *acts;
struct sw_flow_match match;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
@@ -956,20 +941,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
/* Extract key. */
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &new_flow->key, false, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
- ovs_flow_mask_key(&new_flow->key, &key, true, &mask);
-
/* Extract flow identifier. */
error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
- &key, log);
+ &new_flow->key, log);
if (error)
goto err_kfree_flow;
+ /* unmasked key is needed to match when ufid is not used. */
+ if (ovs_identifier_is_key(&new_flow->id))
+ match.key = new_flow->id.unmasked_key;
+
+ ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
+
/* Validate actions. */
error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
&new_flow->key, &acts, log);
@@ -996,7 +985,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
- flow = ovs_flow_tbl_lookup(&dp->table, &key);
+ flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
@@ -1121,7 +1110,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &key, true, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
} else if (!ufid_present) {
@@ -1238,7 +1227,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
} else if (!ufid_present) {
@@ -1297,7 +1286,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
NULL, log);
if (unlikely(err))
@@ -1434,8 +1423,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
},
};
-static struct genl_family dp_flow_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_flow_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
@@ -1446,6 +1434,7 @@ static struct genl_family dp_flow_genl_family = {
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static size_t ovs_dp_cmd_msg_size(void)
@@ -1820,8 +1809,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
},
};
-static struct genl_family dp_datapath_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_datapath_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
@@ -1832,6 +1820,7 @@ static struct genl_family dp_datapath_genl_family = {
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
/* Called with ovs_mutex or RCU read lock. */
@@ -2242,8 +2231,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
},
};
-struct genl_family dp_vport_genl_family = {
- .id = GENL_ID_GENERATE,
+struct genl_family dp_vport_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
@@ -2254,6 +2242,7 @@ struct genl_family dp_vport_genl_family = {
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static struct genl_family * const dp_genl_families[] = {
@@ -2271,7 +2260,7 @@ static void dp_unregister_genl(int n_families)
genl_unregister_family(dp_genl_families[i]);
}
-static int dp_register_genl(void)
+static int __init dp_register_genl(void)
{
int err;
int i;
@@ -2437,3 +2426,7 @@ module_exit(dp_cleanup);
MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ab85c1cae255..1c6e9377436d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,7 +144,7 @@ struct ovs_net {
bool xt_label;
};
-extern int ovs_net_id;
+extern unsigned int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0ea128eeeab2..2c0a00f7f1b7 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
{
struct flow_stats *stats;
int node = numa_node_id();
+ int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- stats = rcu_dereference(flow->stats[node]);
+ stats = rcu_dereference(flow->stats[cpu]);
- /* Check if already have node-specific stats. */
+ /* Check if already have CPU-specific stats. */
if (likely(stats)) {
spin_lock(&stats->lock);
/* Mark if we write on the pre-allocated stats. */
- if (node == 0 && unlikely(flow->stats_last_writer != node))
- flow->stats_last_writer = node;
+ if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
+ flow->stats_last_writer = cpu;
} else {
stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
spin_lock(&stats->lock);
- /* If the current NUMA-node is the only writer on the
+ /* If the current CPU is the only writer on the
* pre-allocated stats keep using them.
*/
- if (unlikely(flow->stats_last_writer != node)) {
+ if (unlikely(flow->stats_last_writer != cpu)) {
/* A previous locker may have already allocated the
- * stats, so we need to check again. If node-specific
+ * stats, so we need to check again. If CPU-specific
* stats were already allocated, we update the pre-
* allocated stats as we have already locked them.
*/
- if (likely(flow->stats_last_writer != NUMA_NO_NODE)
- && likely(!rcu_access_pointer(flow->stats[node]))) {
- /* Try to allocate node-specific stats. */
+ if (likely(flow->stats_last_writer != -1) &&
+ likely(!rcu_access_pointer(flow->stats[cpu]))) {
+ /* Try to allocate CPU-specific stats. */
struct flow_stats *new_stats;
new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
new_stats->tcp_flags = tcp_flags;
spin_lock_init(&new_stats->lock);
- rcu_assign_pointer(flow->stats[node],
+ rcu_assign_pointer(flow->stats[cpu],
new_stats);
goto unlock;
}
}
- flow->stats_last_writer = node;
+ flow->stats_last_writer = cpu;
}
}
@@ -136,14 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int node;
+ int cpu;
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- for_each_node(node) {
- struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+ struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
/* Local CPU may write on non-local stats, so we must
@@ -163,10 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int node;
+ int cpu;
- for_each_node(node) {
- struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+ struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
spin_lock_bh(&stats->lock);
@@ -302,24 +306,74 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
-static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+/**
+ * Parse vlan tag from vlan header.
+ * Returns ERROR on memory error.
+ * Returns 0 if it encounters a non-vlan or incomplete packet.
+ * Returns 1 after successfully parsing vlan tag.
+ */
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
+ bool untag_vlan)
{
- struct qtag_prefix {
- __be16 eth_type; /* ETH_P_8021Q */
- __be16 tci;
- };
- struct qtag_prefix *qp;
+ struct vlan_head *vh = (struct vlan_head *)skb->data;
- if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
+ if (likely(!eth_type_vlan(vh->tpid)))
return 0;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
- sizeof(__be16))))
+ if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16)))
+ return 0;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) +
+ sizeof(__be16))))
return -ENOMEM;
- qp = (struct qtag_prefix *) skb->data;
- key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
- __skb_pull(skb, sizeof(struct qtag_prefix));
+ vh = (struct vlan_head *)skb->data;
+ key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
+ key_vh->tpid = vh->tpid;
+
+ if (unlikely(untag_vlan)) {
+ int offset = skb->data - skb_mac_header(skb);
+ u16 tci;
+ int err;
+
+ __skb_push(skb, offset);
+ err = __skb_vlan_pop(skb, &tci);
+ __skb_pull(skb, offset);
+ if (err)
+ return err;
+ __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
+ } else {
+ __skb_pull(skb, sizeof(struct vlan_head));
+ }
+ return 1;
+}
+
+static void clear_vlan(struct sw_flow_key *key)
+{
+ key->eth.vlan.tci = 0;
+ key->eth.vlan.tpid = 0;
+ key->eth.cvlan.tci = 0;
+ key->eth.cvlan.tpid = 0;
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int res;
+
+ if (skb_vlan_tag_present(skb)) {
+ key->eth.vlan.tci = htons(skb->vlan_tci);
+ key->eth.vlan.tpid = skb->vlan_proto;
+ } else {
+ /* Parse outer vlan tag in the non-accelerated case. */
+ res = parse_vlan_tag(skb, &key->eth.vlan, true);
+ if (res <= 0)
+ return res;
+ }
+
+ /* Parse inner vlan tag. */
+ res = parse_vlan_tag(skb, &key->eth.cvlan, false);
+ if (res <= 0)
+ return res;
return 0;
}
@@ -446,17 +500,20 @@ invalid:
*
* Returns 0 if successful, otherwise a negative errno value.
*
- * Initializes @skb header pointers as follows:
+ * Initializes @skb header fields as follows:
*
- * - skb->mac_header: the Ethernet header.
+ * - skb->mac_header: the L2 header.
*
- * - skb->network_header: just past the Ethernet header, or just past the
- * VLAN header, to the first byte of the Ethernet payload.
+ * - skb->network_header: just past the L2 header, or just past the
+ * VLAN header, to the first byte of the L2 payload.
*
* - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
* on output, then just past the IP header, if one is present and
* of a correct length, otherwise the same as skb->network_header.
* For other key->eth.type values it is left untouched.
+ *
+ * - skb->protocol: the type of the data starting at skb->network_header.
+ * Equals to key->eth.type.
*/
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -468,32 +525,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
skb_reset_mac_header(skb);
- /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
- * header in the linear data area.
- */
- eth = eth_hdr(skb);
- ether_addr_copy(key->eth.src, eth->h_source);
- ether_addr_copy(key->eth.dst, eth->h_dest);
+ /* Link layer. */
+ clear_vlan(key);
+ if (key->mac_proto == MAC_PROTO_NONE) {
+ if (unlikely(eth_type_vlan(skb->protocol)))
+ return -EINVAL;
- __skb_pull(skb, 2 * ETH_ALEN);
- /* We are going to push all headers that we pull, so no need to
- * update skb->csum here.
- */
+ skb_reset_network_header(skb);
+ } else {
+ eth = eth_hdr(skb);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
+
+ __skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
- key->eth.tci = 0;
- if (skb_vlan_tag_present(skb))
- key->eth.tci = htons(skb->vlan_tci);
- else if (eth->h_proto == htons(ETH_P_8021Q))
if (unlikely(parse_vlan(skb, key)))
return -ENOMEM;
- key->eth.type = parse_ethertype(skb);
- if (unlikely(key->eth.type == htons(0)))
- return -ENOMEM;
+ skb->protocol = parse_ethertype(skb);
+ if (unlikely(skb->protocol == htons(0)))
+ return -ENOMEM;
- skb_reset_network_header(skb);
+ skb_reset_network_header(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+ }
skb_reset_mac_len(skb);
- __skb_push(skb, skb->data - skb_mac_header(skb));
+ key->eth.type = skb->protocol;
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
@@ -600,12 +660,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
} else if (eth_p_mpls(key->eth.type)) {
size_t stack_len = MPLS_HLEN;
- /* In the presence of an MPLS label stack the end of the L2
- * header and the beginning of the L3 header differ.
- *
- * Advance network_header to the beginning of the L3
- * header. mac_len corresponds to the end of the L2 header.
- */
+ skb_set_inner_network_header(skb, skb->mac_len);
while (1) {
__be32 lse;
@@ -613,12 +668,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
if (unlikely(error))
return 0;
- memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+ memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);
if (stack_len == MPLS_HLEN)
memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
- skb_set_network_header(skb, skb->mac_len + stack_len);
+ skb_set_inner_network_header(skb, skb->mac_len + stack_len);
if (lse & htonl(MPLS_LS_S_MASK))
break;
@@ -693,9 +748,25 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
return key_extract(skb, key);
}
+static int key_extract_mac_proto(struct sk_buff *skb)
+{
+ switch (skb->dev->type) {
+ case ARPHRD_ETHER:
+ return MAC_PROTO_ETHERNET;
+ case ARPHRD_NONE:
+ if (skb->protocol == htons(ETH_P_TEB))
+ return MAC_PROTO_ETHERNET;
+ return MAC_PROTO_NONE;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
+ int res;
+
/* Extract metadata from packet. */
if (tun_info) {
key->tun_proto = ip_tunnel_info_af(tun_info);
@@ -723,6 +794,10 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->phy.skb_mark = skb->mark;
ovs_ct_fill_key(skb, key);
key->ovs_flow_hash = 0;
+ res = key_extract_mac_proto(skb);
+ if (res < 0)
+ return res;
+ key->mac_proto = res;
key->recirc_id = 0;
return key_extract(skb, key);
@@ -734,12 +809,20 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
{
int err;
- memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
-
/* Extract metadata from netlink attributes. */
err = ovs_nla_get_flow_metadata(net, attr, key, log);
if (err)
return err;
+ /* key_extract assumes that skb->protocol is set-up for
+ * layer 3 packets which is the case for other callers,
+ * in particular packets received from the network stack.
+ * Here the correct value can be set from the metadata
+ * extracted above.
+ * For L2 packet key eth type would be zero. skb protocol
+ * would be set to correct value later during key-extact.
+ */
+
+ skb->protocol = key->eth.type;
return key_extract(skb, key);
}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 03378e75a67c..f61cae7f9030 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -37,6 +37,12 @@
struct sk_buff;
+enum sw_flow_mac_proto {
+ MAC_PROTO_NONE = 0,
+ MAC_PROTO_ETHERNET,
+};
+#define SW_FLOW_KEY_INVALID 0x80
+
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -50,6 +56,11 @@ struct ovs_tunnel_info {
struct metadata_dst *tun_dst;
};
+struct vlan_head {
+ __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/
+ __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+};
+
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
FIELD_SIZEOF(struct sw_flow_key, recirc_id))
@@ -63,13 +74,15 @@ struct sw_flow_key {
u32 skb_mark; /* SKB mark. */
u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
} __packed phy; /* Safe when right after 'tun_key'. */
+ u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */
u8 tun_proto; /* Protocol of encapsulating tunnel. */
u32 ovs_flow_hash; /* Datapath computed hash value. */
u32 recirc_id; /* Recirculation ID. */
struct {
u8 src[ETH_ALEN]; /* Ethernet source address. */
u8 dst[ETH_ALEN]; /* Ethernet destination address. */
- __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ struct vlan_head vlan;
+ struct vlan_head cvlan;
__be16 type; /* Ethernet frame type. */
} eth;
union {
@@ -172,14 +185,14 @@ struct sw_flow {
struct hlist_node node[2];
u32 hash;
} flow_table, ufid_table;
- int stats_last_writer; /* NUMA-node id of the last writer on
+ int stats_last_writer; /* CPU id of the last writer on
* 'stats[0]'.
*/
struct sw_flow_key key;
struct sw_flow_id id;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
- struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
+ struct flow_stats __rcu *stats[]; /* One for each CPU. First one
* is allocated at flow creation time,
* the rest are allocated on demand
* while holding the 'stats[0].lock'.
@@ -200,6 +213,21 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
+static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key)
+{
+ return key->mac_proto & ~SW_FLOW_KEY_INVALID;
+}
+
+static inline u16 __ovs_mac_header_len(u8 mac_proto)
+{
+ return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0;
+}
+
+static inline u16 ovs_mac_header_len(const struct sw_flow_key *key)
+{
+ return __ovs_mac_header_len(ovs_key_mac_proto(key));
+}
+
static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid)
{
return sfid->ufid_len;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c78a6a1476fb..c87d359b9b37 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -123,7 +123,7 @@ static void update_range(struct sw_flow_match *match,
static bool match_validate(const struct sw_flow_match *match,
u64 key_attrs, u64 mask_attrs, bool log)
{
- u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 key_expected = 0;
u64 mask_allowed = key_attrs; /* At most allow all key attributes */
/* The following mask attributes allowed only if they
@@ -808,10 +808,194 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb,
ip_tunnel_info_af(tun_info));
}
+static int encode_vlan_from_nlattrs(struct sw_flow_match *match,
+ const struct nlattr *a[],
+ bool is_mask, bool inner)
+{
+ __be16 tci = 0;
+ __be16 tpid = 0;
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (likely(!inner)) {
+ SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask);
+ SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask);
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask);
+ }
+ return 0;
+}
+
+static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
+ u64 key_attrs, bool inner,
+ const struct nlattr **a, bool log)
+{
+ __be16 tci = 0;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) {
+ /* Not a VLAN. */
+ return 0;
+ }
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (tci) {
+ OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
+ /* Corner case for truncated VLAN header. */
+ OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+ }
+
+ return 1;
+}
+
+static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
+ u64 key_attrs, bool inner,
+ const struct nlattr **a, bool log)
+{
+ __be16 tci = 0;
+ __be16 tpid = 0;
+ bool encap_valid = !!(match->key->eth.vlan.tci &
+ htons(VLAN_TAG_PRESENT));
+ bool i_encap_valid = !!(match->key->eth.cvlan.tci &
+ htons(VLAN_TAG_PRESENT));
+
+ if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
+ /* Not a VLAN. */
+ return 0;
+ }
+
+ if ((!inner && !encap_valid) || (inner && !i_encap_valid)) {
+ OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (tpid != htons(0xffff)) {
+ OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).",
+ (inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
+ return -EINVAL;
+ }
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ return 1;
+}
+
+static int __parse_vlan_from_nlattrs(struct sw_flow_match *match,
+ u64 *key_attrs, bool inner,
+ const struct nlattr **a, bool is_mask,
+ bool log)
+{
+ int err;
+ const struct nlattr *encap;
+
+ if (!is_mask)
+ err = validate_vlan_from_nlattrs(match, *key_attrs, inner,
+ a, log);
+ else
+ err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner,
+ a, log);
+ if (err <= 0)
+ return err;
+
+ err = encode_vlan_from_nlattrs(match, a, is_mask, inner);
+ if (err)
+ return err;
+
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+
+ encap = a[OVS_KEY_ATTR_ENCAP];
+
+ if (!is_mask)
+ err = parse_flow_nlattrs(encap, a, key_attrs, log);
+ else
+ err = parse_flow_mask_nlattrs(encap, a, key_attrs, log);
+
+ return err;
+}
+
+static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
+ u64 *key_attrs, const struct nlattr **a,
+ bool is_mask, bool log)
+{
+ int err;
+ bool encap_valid = false;
+
+ err = __parse_vlan_from_nlattrs(match, key_attrs, false, a,
+ is_mask, log);
+ if (err)
+ return err;
+
+ encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT));
+ if (encap_valid) {
+ err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
+ is_mask, log);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int parse_eth_type_from_nlattrs(struct sw_flow_match *match,
+ u64 *attrs, const struct nlattr **a,
+ bool is_mask, bool log)
+{
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (!eth_proto_is_802_3(eth_type)) {
+ OVS_NLERR(log, "EtherType %x is less than min %x",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ return 0;
+}
+
static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
u64 *attrs, const struct nlattr **a,
bool is_mask, bool log)
{
+ u8 mac_proto = MAC_PROTO_ETHERNET;
+
if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
@@ -898,6 +1082,21 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
sizeof(*cl), is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
}
+
+ /* For layer 3 packets the Ethernet type is provided
+ * and treated as metadata but no MAC addresses are provided.
+ */
+ if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) &&
+ (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)))
+ mac_proto = MAC_PROTO_NONE;
+
+ /* Always exact match mac_proto */
+ SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask);
+
+ if (mac_proto == MAC_PROTO_NONE)
+ return parse_eth_type_from_nlattrs(match, attrs, a, is_mask,
+ log);
+
return 0;
}
@@ -920,42 +1119,26 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
SW_FLOW_KEY_MEMCPY(match, eth.dst,
eth_key->eth_dst, ETH_ALEN, is_mask);
attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- __be16 tci;
-
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- if (is_mask)
- OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.");
- else
- OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set.");
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ /* VLAN attribute is always parsed before getting here since it
+ * may occur multiple times.
+ */
+ OVS_NLERR(log, "VLAN attribute unexpected.");
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- __be16 eth_type;
-
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (is_mask) {
- /* Always exact match EtherType. */
- eth_type = htons(0xffff);
- } else if (!eth_proto_is_802_3(eth_type)) {
- OVS_NLERR(log, "EtherType %x is less than min %x",
- ntohs(eth_type), ETH_P_802_3_MIN);
- return -EINVAL;
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask,
+ log);
+ if (err)
+ return err;
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
}
-
- SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ } else if (!match->key->eth.type) {
+ OVS_NLERR(log, "Either Ethernet header or EtherType is required.");
+ return -EINVAL;
}
if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
@@ -1182,49 +1365,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
bool log)
{
const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- const struct nlattr *encap;
struct nlattr *newmask = NULL;
u64 key_attrs = 0;
u64 mask_attrs = 0;
- bool encap_valid = false;
int err;
err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
if (err)
return err;
- if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
- (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
- __be16 tci;
-
- if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
- OVS_NLERR(log, "Invalid Vlan frame.");
- return -EINVAL;
- }
-
- key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- encap = a[OVS_KEY_ATTR_ENCAP];
- key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- encap_valid = true;
-
- if (tci & htons(VLAN_TAG_PRESENT)) {
- err = parse_flow_nlattrs(encap, a, &key_attrs, log);
- if (err)
- return err;
- } else if (!tci) {
- /* Corner case for truncated 802.1Q header. */
- if (nla_len(encap)) {
- OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute.");
- return -EINVAL;
- }
- } else {
- OVS_NLERR(log, "Encap attr is set for non-VLAN frame");
- return -EINVAL;
- }
- }
+ err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log);
+ if (err)
+ return err;
err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
if (err)
@@ -1265,46 +1417,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
goto free_newmask;
/* Always match on tci. */
- SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
-
- if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
- __be16 eth_type = 0;
- __be16 tci = 0;
-
- if (!encap_valid) {
- OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame.");
- err = -EINVAL;
- goto free_newmask;
- }
-
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- if (a[OVS_KEY_ATTR_ETHERTYPE])
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-
- if (eth_type == htons(0xffff)) {
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- encap = a[OVS_KEY_ATTR_ENCAP];
- err = parse_flow_mask_nlattrs(encap, a,
- &mask_attrs, log);
- if (err)
- goto free_newmask;
- } else {
- OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).",
- ntohs(eth_type));
- err = -EINVAL;
- goto free_newmask;
- }
-
- if (a[OVS_KEY_ATTR_VLAN])
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true);
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).",
- ntohs(tci));
- err = -EINVAL;
- goto free_newmask;
- }
- }
+ err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log);
+ if (err)
+ goto free_newmask;
err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
log);
@@ -1410,12 +1528,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
}
+static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
+ bool is_mask)
+{
+ __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff);
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, bool is_mask,
struct sk_buff *skb)
{
struct ovs_key_ethernet *eth_key;
- struct nlattr *nla, *encap;
+ struct nlattr *nla;
+ struct nlattr *encap = NULL;
+ struct nlattr *in_encap = NULL;
if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
goto nla_put_failure;
@@ -1456,43 +1587,57 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (ovs_ct_put_key(output, skb))
goto nla_put_failure;
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
+ if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
- eth_key = nla_data(nla);
- ether_addr_copy(eth_key->eth_src, output->eth.src);
- ether_addr_copy(eth_key->eth_dst, output->eth.dst);
+ eth_key = nla_data(nla);
+ ether_addr_copy(eth_key->eth_src, output->eth.src);
+ ether_addr_copy(eth_key->eth_dst, output->eth.dst);
- if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
- __be16 eth_type;
- eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
- nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
- goto nla_put_failure;
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.tci)
- goto unencap;
- } else
- encap = NULL;
-
- if (swkey->eth.type == htons(ETH_P_802_2)) {
- /*
- * Ethertype 802.2 is represented in the netlink with omitted
- * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
- * 0xffff in the mask attribute. Ethertype can also
- * be wildcarded.
- */
- if (is_mask && output->eth.type)
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
- output->eth.type))
+ if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
goto nla_put_failure;
- goto unencap;
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.vlan.tci)
+ goto unencap;
+
+ if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ goto nla_put_failure;
+ in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.cvlan.tci)
+ goto unencap;
+ }
+ }
+
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
}
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
goto nla_put_failure;
+ if (eth_type_vlan(swkey->eth.type)) {
+ /* There are 3 VLAN tags, we don't know anything about the rest
+ * of the packet, so truncate here.
+ */
+ WARN_ON_ONCE(!(encap && in_encap));
+ goto unencap;
+ }
+
if (swkey->eth.type == htons(ETH_P_IP)) {
struct ovs_key_ipv4 *ipv4_key;
@@ -1619,6 +1764,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
}
unencap:
+ if (in_encap)
+ nla_nest_end(skb, in_encap);
if (encap)
nla_nest_end(skb, encap);
@@ -1882,13 +2029,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
void ovs_match_init(struct sw_flow_match *match,
struct sw_flow_key *key,
+ bool reset_key,
struct sw_flow_mask *mask)
{
memset(match, 0, sizeof(*match));
match->key = key;
match->mask = mask;
- memset(key, 0, sizeof(*key));
+ if (reset_key)
+ memset(key, 0, sizeof(*key));
if (mask) {
memset(&mask->key, 0, sizeof(mask->key));
@@ -1935,7 +2084,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct nlattr *a;
int err = 0, start, opts_type;
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
return opts_type;
@@ -2010,8 +2159,8 @@ static bool validate_masked(u8 *data, int len)
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key,
- struct sw_flow_actions **sfa,
- bool *skip_copy, __be16 eth_type, bool masked, bool log)
+ struct sw_flow_actions **sfa, bool *skip_copy,
+ u8 mac_proto, __be16 eth_type, bool masked, bool log)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
@@ -2041,7 +2190,11 @@ static int validate_set(const struct nlattr *a,
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ break;
+
case OVS_KEY_ATTR_ETHERNET:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
break;
case OVS_KEY_ATTR_TUNNEL:
@@ -2208,6 +2361,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
int depth, struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
{
+ u8 mac_proto = ovs_key_mac_proto(key);
const struct nlattr *a;
int rem, err;
@@ -2230,6 +2384,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
[OVS_ACTION_ATTR_CT] = (u32)-1,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
+ [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
+ [OVS_ACTION_ATTR_POP_ETH] = 0,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2278,12 +2434,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
}
case OVS_ACTION_ATTR_POP_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan_tci = htons(0);
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan = nla_data(a);
- if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
return -EINVAL;
@@ -2331,14 +2491,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, false, log);
+ &skip_copy, mac_proto, eth_type,
+ false, log);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SET_MASKED:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, true, log);
+ &skip_copy, mac_proto, eth_type,
+ true, log);
if (err)
return err;
break;
@@ -2358,6 +2520,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
skip_copy = true;
break;
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ /* Disallow pushing an Ethernet header if one
+ * is already present */
+ if (mac_proto != MAC_PROTO_NONE)
+ return -EINVAL;
+ mac_proto = MAC_PROTO_NONE;
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
+ if (vlan_tci & htons(VLAN_TAG_PRESENT))
+ return -EINVAL;
+ mac_proto = MAC_PROTO_ETHERNET;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -2388,7 +2566,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
(*sfa)->orig_len = nla_len(attr);
err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type,
- key->eth.tci, log);
+ key->eth.vlan.tci, log);
if (err)
ovs_nla_free_flow_actions(*sfa);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 47dd142eca1c..45f9769e5aac 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void);
size_t ovs_key_attr_size(void);
void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key, struct sw_flow_mask *mask);
+ struct sw_flow_key *key, bool reset_key,
+ struct sw_flow_mask *mask);
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d073fff82fdb..ea7a8073fa02 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -32,6 +32,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
struct flow_stats *stats;
- int node;
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
if (!flow)
return ERR_PTR(-ENOMEM);
- flow->sf_acts = NULL;
- flow->mask = NULL;
- flow->id.unmasked_key = NULL;
- flow->id.ufid_len = 0;
- flow->stats_last_writer = NUMA_NO_NODE;
+ flow->stats_last_writer = -1;
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void)
RCU_INIT_POINTER(flow->stats[0], stats);
- for_each_node(node)
- if (node != 0)
- RCU_INIT_POINTER(flow->stats[node], NULL);
-
return flow;
err:
kmem_cache_free(flow_cache, flow);
@@ -142,16 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
static void flow_free(struct sw_flow *flow)
{
- int node;
+ int cpu;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
- for_each_node(node)
- if (flow->stats[node])
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
+ if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
- (struct flow_stats __force *)flow->stats[node]);
+ (struct flow_stats __force *)flow->stats[cpu]);
kmem_cache_free(flow_cache, flow);
}
@@ -756,7 +749,7 @@ int ovs_flow_init(void)
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
- + (nr_node_ids
+ + (nr_cpu_ids
* sizeof(struct flow_stats *)),
0, 0, NULL);
if (flow_cache == NULL)
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 95c36147a6e1..d5d6caecd072 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -89,15 +89,6 @@ static const struct ethtool_ops internal_dev_ethtool_ops = {
.get_link = ethtool_op_get_link,
};
-static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
-{
- if (new_mtu < 68)
- return -EINVAL;
-
- netdev->mtu = new_mtu;
- return 0;
-}
-
static void internal_dev_destructor(struct net_device *dev)
{
struct vport *vport = ovs_internal_dev_get_vport(dev);
@@ -148,7 +139,6 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
- .ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
.ndo_set_rx_headroom = internal_set_rx_headroom,
};
@@ -176,7 +166,7 @@ static void do_setup(struct net_device *netdev)
netdev->vlan_features = netdev->features;
netdev->hw_enc_features = netdev->features;
- netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
+ netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
eth_hw_addr_random(netdev);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 4e3972344aa6..0389398fa4ab 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -57,8 +57,10 @@ static void netdev_port_receive(struct sk_buff *skb)
if (unlikely(!skb))
return;
- skb_push(skb, ETH_HLEN);
- skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ if (skb->dev->type == ARPHRD_ETHER) {
+ skb_push(skb, ETH_HLEN);
+ skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ }
ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
error:
@@ -97,7 +99,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
}
if (vport->dev->flags & IFF_LOOPBACK ||
- vport->dev->type != ARPHRD_ETHER ||
+ (vport->dev->type != ARPHRD_ETHER &&
+ vport->dev->type != ARPHRD_NONE) ||
ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
@@ -162,7 +165,6 @@ void ovs_netdev_detach_dev(struct vport *vport)
netdev_master_upper_dev_get(vport->dev));
dev_set_promiscuity(vport->dev, -1);
}
-EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6b21fd068d87..b6c8524032a0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -463,42 +463,52 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
ovs_dp_process_packet(skb, &key);
return 0;
}
-EXPORT_SYMBOL_GPL(ovs_vport_receive);
-static void free_vport_rcu(struct rcu_head *rcu)
+static unsigned int packet_length(const struct sk_buff *skb,
+ struct net_device *dev)
{
- struct vport *vport = container_of(rcu, struct vport, rcu);
+ unsigned int length = skb->len - dev->hard_header_len;
- ovs_vport_free(vport);
-}
-
-void ovs_vport_deferred_free(struct vport *vport)
-{
- if (!vport)
- return;
-
- call_rcu(&vport->rcu, free_vport_rcu);
-}
-EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-
-static unsigned int packet_length(const struct sk_buff *skb)
-{
- unsigned int length = skb->len - ETH_HLEN;
-
- if (skb->protocol == htons(ETH_P_8021Q))
+ if (!skb_vlan_tag_present(skb) &&
+ eth_type_vlan(skb->protocol))
length -= VLAN_HLEN;
+ /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow
+ * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none
+ * account for 802.1ad. e.g. is_skb_forwardable().
+ */
+
return length;
}
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
{
int mtu = vport->dev->mtu;
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ switch (vport->dev->type) {
+ case ARPHRD_NONE:
+ if (mac_proto == MAC_PROTO_ETHERNET) {
+ skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
+ skb->protocol = htons(ETH_P_TEB);
+ } else if (mac_proto != MAC_PROTO_NONE) {
+ WARN_ON_ONCE(1);
+ goto drop;
+ }
+ break;
+ case ARPHRD_ETHER:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ goto drop;
+ break;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(packet_length(skb, vport->dev) > mtu &&
+ !skb_is_gso(skb))) {
net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
vport->dev->name,
- packet_length(skb), mtu);
+ packet_length(skb, vport->dev), mtu);
vport->dev->stats.tx_errors++;
goto drop;
}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index f01f28a567ad..cda66c26ad08 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -149,7 +149,6 @@ struct vport_ops {
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
-void ovs_vport_deferred_free(struct vport *vport);
#define VPORT_ALIGN 8
@@ -198,6 +197,6 @@ int __ovs_vport_ops_register(struct vport_ops *ops);
})
void ovs_vport_ops_unregister(struct vport_ops *ops);
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto);
#endif /* vport.h */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 33a4697d5539..70f5b6a4683c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -73,7 +73,7 @@
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
@@ -250,7 +250,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
static int packet_direct_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- netdev_features_t features;
+ struct sk_buff *orig_skb = skb;
struct netdev_queue *txq;
int ret = NETDEV_TX_BUSY;
@@ -258,9 +258,8 @@ static int packet_direct_xmit(struct sk_buff *skb)
!netif_carrier_ok(dev)))
goto drop;
- features = netif_skb_features(skb);
- if (skb_needs_linearize(skb, features) &&
- __skb_linearize(skb))
+ skb = validate_xmit_skb_list(skb, dev);
+ if (skb != orig_skb)
goto drop;
txq = skb_get_tx_queue(dev, skb);
@@ -280,7 +279,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
return ret;
drop:
atomic_long_inc(&dev->tx_dropped);
- kfree_skb(skb);
+ kfree_skb_list(skb);
return NET_XMIT_DROP;
}
@@ -1498,6 +1497,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
f->arr[f->num_members] = sk;
smp_wmb();
f->num_members++;
+ if (f->num_members == 1)
+ dev_add_pack(&f->prot_hook);
spin_unlock(&f->lock);
}
@@ -1514,6 +1515,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
BUG_ON(i >= f->num_members);
f->arr[i] = f->arr[f->num_members - 1];
f->num_members--;
+ if (f->num_members == 0)
+ __dev_remove_pack(&f->prot_hook);
spin_unlock(&f->lock);
}
@@ -1620,6 +1623,7 @@ static void fanout_release_data(struct packet_fanout *f)
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
+ struct packet_rollover *rollover = NULL;
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
@@ -1642,23 +1646,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
return -EINVAL;
}
+ mutex_lock(&fanout_mutex);
+
+ err = -EINVAL;
if (!po->running)
- return -EINVAL;
+ goto out;
+ err = -EALREADY;
if (po->fanout)
- return -EALREADY;
+ goto out;
if (type == PACKET_FANOUT_ROLLOVER ||
(type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
- po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
- if (!po->rollover)
- return -ENOMEM;
- atomic_long_set(&po->rollover->num, 0);
- atomic_long_set(&po->rollover->num_huge, 0);
- atomic_long_set(&po->rollover->num_failed, 0);
+ err = -ENOMEM;
+ rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
+ if (!rollover)
+ goto out;
+ atomic_long_set(&rollover->num, 0);
+ atomic_long_set(&rollover->num_huge, 0);
+ atomic_long_set(&rollover->num_failed, 0);
+ po->rollover = rollover;
}
- mutex_lock(&fanout_mutex);
match = NULL;
list_for_each_entry(f, &fanout_list, list) {
if (f->id == id &&
@@ -1688,7 +1697,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.func = packet_rcv_fanout;
match->prot_hook.af_packet_priv = match;
match->prot_hook.id_match = match_fanout_group;
- dev_add_pack(&match->prot_hook);
list_add(&match->list, &fanout_list);
}
err = -EINVAL;
@@ -1705,36 +1713,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
out:
- mutex_unlock(&fanout_mutex);
- if (err) {
- kfree(po->rollover);
+ if (err && rollover) {
+ kfree(rollover);
po->rollover = NULL;
}
+ mutex_unlock(&fanout_mutex);
return err;
}
-static void fanout_release(struct sock *sk)
+/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
+ * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
+ * It is the responsibility of the caller to call fanout_release_data() and
+ * free the returned packet_fanout (after synchronize_net())
+ */
+static struct packet_fanout *fanout_release(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f;
+ mutex_lock(&fanout_mutex);
f = po->fanout;
- if (!f)
- return;
+ if (f) {
+ po->fanout = NULL;
- mutex_lock(&fanout_mutex);
- po->fanout = NULL;
+ if (atomic_dec_and_test(&f->sk_ref))
+ list_del(&f->list);
+ else
+ f = NULL;
- if (atomic_dec_and_test(&f->sk_ref)) {
- list_del(&f->list);
- dev_remove_pack(&f->prot_hook);
- fanout_release_data(f);
- kfree(f);
+ if (po->rollover)
+ kfree_rcu(po->rollover, rcu);
}
mutex_unlock(&fanout_mutex);
- if (po->rollover)
- kfree_rcu(po->rollover, rcu);
+ return f;
}
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -1968,17 +1980,6 @@ static unsigned int run_filter(struct sk_buff *skb,
return res;
}
-static int __packet_rcv_vnet(const struct sk_buff *skb,
- struct virtio_net_hdr *vnet_hdr)
-{
- *vnet_hdr = (const struct virtio_net_hdr) { 0 };
-
- if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
- BUG();
-
- return 0;
-}
-
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
size_t *len)
{
@@ -1988,7 +1989,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
return -EINVAL;
*len -= sizeof(vnet_hdr);
- if (__packet_rcv_vnet(skb, &vnet_hdr))
+ if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
return -EINVAL;
return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2247,8 +2248,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
spin_unlock(&sk->sk_receive_queue.lock);
if (po->has_vnet_hdr) {
- if (__packet_rcv_vnet(skb, h.raw + macoff -
- sizeof(struct virtio_net_hdr))) {
+ if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr),
+ vio_le(), true)) {
spin_lock(&sk->sk_receive_queue.lock);
goto drop_n_account;
}
@@ -2391,8 +2393,6 @@ static void tpacket_set_protocol(const struct net_device *dev,
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
- unsigned short gso_type = 0;
-
if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
(__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
@@ -2404,69 +2404,22 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
return -EINVAL;
- if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
- switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
- case VIRTIO_NET_HDR_GSO_TCPV4:
- gso_type = SKB_GSO_TCPV4;
- break;
- case VIRTIO_NET_HDR_GSO_TCPV6:
- gso_type = SKB_GSO_TCPV6;
- break;
- case VIRTIO_NET_HDR_GSO_UDP:
- gso_type = SKB_GSO_UDP;
- break;
- default:
- return -EINVAL;
- }
-
- if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
- gso_type |= SKB_GSO_TCP_ECN;
-
- if (vnet_hdr->gso_size == 0)
- return -EINVAL;
- }
-
- vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
return 0;
}
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
struct virtio_net_hdr *vnet_hdr)
{
- int n;
-
if (*len < sizeof(*vnet_hdr))
return -EINVAL;
*len -= sizeof(*vnet_hdr);
- n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
- if (n != sizeof(*vnet_hdr))
+ if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
return -EFAULT;
return __packet_snd_vnet_parse(vnet_hdr, *len);
}
-static int packet_snd_vnet_gso(struct sk_buff *skb,
- struct virtio_net_hdr *vnet_hdr)
-{
- if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
- u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
-
- if (!skb_partial_csum_set(skb, s, o))
- return -EINVAL;
- }
-
- skb_shinfo(skb)->gso_size =
- __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
- skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
-
- /* Header must be checked, and gso_segs computed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
- return 0;
-}
-
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
void *frame, struct net_device *dev, void *data, int tp_len,
__be16 proto, unsigned char *addr, int hlen, int copylen,
@@ -2726,7 +2679,8 @@ tpacket_error:
}
}
- if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
+ if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
+ vio_le())) {
tp_len = -EINVAL;
goto tpacket_error;
}
@@ -2814,7 +2768,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
struct packet_sock *po = pkt_sk(sk);
- int hlen, tlen;
+ int hlen, tlen, linear;
int extra_len = 0;
/*
@@ -2875,8 +2829,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
err = -ENOBUFS;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
- skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
- __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
+ linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
+ linear = max(linear, min_t(int, len, dev->hard_header_len));
+ skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
msg->msg_flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto out_unlock;
@@ -2917,7 +2872,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
packet_pick_tx_queue(dev, skb);
if (po->has_vnet_hdr) {
- err = packet_snd_vnet_gso(skb, &vnet_hdr);
+ err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
goto out_free;
len += sizeof(vnet_hdr);
@@ -2965,6 +2920,7 @@ static int packet_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct packet_sock *po;
+ struct packet_fanout *f;
struct net *net;
union tpacket_req_u req_u;
@@ -3004,9 +2960,14 @@ static int packet_release(struct socket *sock)
packet_set_ring(sk, &req_u, 1, 1);
}
- fanout_release(sk);
+ f = fanout_release(sk);
synchronize_net();
+
+ if (f) {
+ fanout_release_data(f);
+ kfree(f);
+ }
/*
* Now the socket is dead. No more input will appear.
*/
@@ -3649,19 +3610,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
switch (val) {
case TPACKET_V1:
case TPACKET_V2:
case TPACKET_V3:
- po->tp_version = val;
- return 0;
+ break;
default:
return -EINVAL;
}
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_version = val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_RESERVE:
{
@@ -4164,6 +4131,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
+ lock_sock(sk);
/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
net_warn_ratelimited("Tx-ring is not supported.\n");
@@ -4245,7 +4213,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
goto out;
}
- lock_sock(sk);
/* Detach socket from network */
spin_lock(&po->bind_lock);
@@ -4294,11 +4261,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (!tx_ring)
prb_shutdown_retire_blk_timer(po, rb_queue);
}
- release_sock(sk);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
+ release_sock(sk);
return err;
}
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index fa8237fdc57b..21c28b51be94 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -217,20 +217,10 @@ static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
-static int gprs_set_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops gprs_netdev_ops = {
.ndo_open = gprs_open,
.ndo_stop = gprs_close,
.ndo_start_xmit = gprs_xmit,
- .ndo_change_mtu = gprs_set_mtu,
};
static void gprs_setup(struct net_device *dev)
@@ -239,6 +229,8 @@ static void gprs_setup(struct net_device *dev)
dev->type = ARPHRD_PHONET_PIPE;
dev->flags = IFF_POINTOPOINT | IFF_NOARP;
dev->mtu = GPRS_DEFAULT_MTU;
+ dev->min_mtu = 576;
+ dev->max_mtu = (PHONET_MAX_MTU - 11);
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->tx_queue_len = 10;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 850a86cde0b3..8bad5624a27a 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1167,7 +1167,7 @@ disabled:
/* Wait until flow control allows TX */
done = atomic_read(&pn->tx_credits);
while (!done) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (!timeo) {
err = -EAGAIN;
@@ -1178,10 +1178,9 @@ disabled:
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
- finish_wait(sk_sleep(sk), &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
if (sk->sk_state != TCP_ESTABLISHED)
goto disabled;
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index a58680016472..2cb4c5dfad6f 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -44,7 +44,7 @@ struct phonet_net {
struct phonet_routes routes;
};
-static int phonet_net_id __read_mostly;
+static unsigned int phonet_net_id __read_mostly;
static struct phonet_net *phonet_pernet(struct net *net)
{
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index c985ecbe9bd6..ae5ac175b2be 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
const int pkt_len = 20;
struct qrtr_hdr *hdr;
struct sk_buff *skb;
- u32 *buf;
+ __le32 *buf;
skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
if (!skb)
@@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
hdr->dst_node_id = cpu_to_le32(dst_node);
hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
- buf = (u32 *)skb_put(skb, pkt_len);
+ buf = (__le32 *)skb_put(skb, pkt_len);
memset(buf, 0, pkt_len);
buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
buf[1] = cpu_to_le32(src_node);
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 0e72bec1529f..56c7d27eefee 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -13,5 +13,5 @@ obj-$(CONFIG_RDS_TCP) += rds_tcp.o
rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
tcp_send.o tcp_stats.o
-ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG
+ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6beaeb1138f3..2ac1e6194be3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -605,10 +605,14 @@ static void rds_exit(void)
}
module_exit(rds_exit);
+u32 rds_gen_num;
+
static int rds_init(void)
{
int ret;
+ net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
+
ret = rds_bind_lock_init();
if (ret)
goto out;
diff --git a/net/rds/connection.c b/net/rds/connection.c
index f5058559bb08..fe9d31c0b22d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
+ conn->c_my_gen_num = rds_gen_num;
+ conn->c_peer_gen_num = 0;
hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
@@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
void rds_conn_connect_if_down(struct rds_connection *conn)
{
@@ -689,21 +692,6 @@ void rds_conn_connect_if_down(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
-/*
- * An error occurred on the connection
- */
-void
-__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- vprintk(fmt, ap);
- va_end(ap);
-
- rds_conn_drop(conn);
-}
-
void
__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
{
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7eaf887e46f8..5680d90b0b77 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -160,7 +160,7 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
rds_ibdev->dev = device;
- rds_ibdev->pd = ib_alloc_pd(device);
+ rds_ibdev->pd = ib_alloc_pd(device, 0);
if (IS_ERR(rds_ibdev->pd)) {
rds_ibdev->pd = NULL;
goto put_dev;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 046f7508c06b..45ac8e8e58f4 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -333,6 +333,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
void rds_ib_state_change(struct sock *sk);
int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
+__printf(2, 3)
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
diff --git a/net/rds/message.c b/net/rds/message.c
index 6cb91061556a..49bfb512d808 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS] = sizeof(u16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
};
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4c93badeabf2..ea961144084f 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -135,7 +135,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
/* Release any MRs associated with this socket */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
- mr = container_of(node, struct rds_mr, r_rb_node);
+ mr = rb_entry(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 345f09059e9f..d5f311767157 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -100,11 +100,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
trans->cm_connect_complete(conn, event);
break;
+ case RDMA_CM_EVENT_REJECTED:
+ rdsdebug("Connection rejected: %s\n",
+ rdma_reject_msg(cm_id, event->status));
+ /* FALLTHROUGH */
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
- case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_ADDR_CHANGE:
if (conn)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index b2d17f0fafa8..ebbf909b87ec 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -33,7 +33,7 @@
#define KERNEL_HAS_ATOMIC64
#endif
-#ifdef DEBUG
+#ifdef RDS_DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
@@ -151,6 +151,9 @@ struct rds_connection {
struct rds_conn_path c_path[RDS_MPATH_WORKERS];
wait_queue_head_t c_hs_waitq; /* handshake waitq */
+
+ u32 c_my_gen_num;
+ u32 c_peer_gen_num;
};
static inline
@@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest {
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
-#define RDS_EXTHDR_NPATHS 4
+#define RDS_EXTHDR_NPATHS 5
+#define RDS_EXTHDR_GEN_NUM 6
#define __RDS_EXTHDR_MAX 16 /* for now */
@@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
#define RDS_MSG_RETRANSMITTED 5
#define RDS_MSG_MAPPED 6
#define RDS_MSG_PAGEVEC 7
+#define RDS_MSG_FLUSH 8
struct rds_message {
atomic_t m_refcount;
@@ -664,6 +669,7 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
+extern u32 rds_gen_num;
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
@@ -683,11 +689,8 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len);
-__printf(2, 3)
-void __rds_conn_error(struct rds_connection *conn, const char *, ...);
-#define rds_conn_error(conn, fmt...) \
- __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+__printf(2, 3)
void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
#define rds_conn_path_error(cp, fmt...) \
__rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index cbfabdf3ff48..9d0666e5fe35 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
/* do nothing if no change in cong state */
}
+static void rds_conn_peer_gen_update(struct rds_connection *conn,
+ u32 peer_gen_num)
+{
+ int i;
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
+ if (peer_gen_num != 0) {
+ if (conn->c_peer_gen_num != 0 &&
+ peer_gen_num != conn->c_peer_gen_num) {
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ struct rds_conn_path *cp;
+
+ cp = &conn->c_path[i];
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ cp->cp_next_tx_seq = 1;
+ cp->cp_next_rx_seq = 0;
+ list_for_each_entry_safe(rm, tmp,
+ &cp->cp_retrans,
+ m_conn_item) {
+ set_bit(RDS_MSG_FLUSH, &rm->m_flags);
+ }
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ }
+ }
+ conn->c_peer_gen_num = peer_gen_num;
+ }
+}
+
/*
* Process all extension headers that come with this message.
*/
@@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
union {
struct rds_ext_header_version version;
u16 rds_npaths;
+ u32 rds_gen_num;
} buffer;
+ u32 new_peer_gen_num = 0;
while (1) {
len = sizeof(buffer);
@@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
buffer.rds_npaths);
break;
+ case RDS_EXTHDR_GEN_NUM:
+ new_peer_gen_num = buffer.rds_gen_num;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
@@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
}
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
+ rds_conn_peer_gen_update(conn, new_peer_gen_num);
}
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
diff --git a/net/rds/send.c b/net/rds/send.c
index 896626b9a0ef..77c8c6e613ad 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -259,8 +259,9 @@ restart:
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.op_active &&
- test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
+ (rm->rdma.op_active &&
+ test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
@@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
sizeof(npaths));
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_GEN_NUM,
+ &cp->cp_conn->c_my_gen_num,
+ sizeof(u32));
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index fcddacc92e01..57bb52361e0f 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -220,7 +220,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
write_unlock_bh(&sock->sk->sk_callback_lock);
}
-static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
@@ -229,6 +229,7 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
unsigned long flags;
struct sockaddr_in sin;
int sinlen;
+ struct socket *sock;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -237,12 +238,17 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
+ sock = tc->t_sock;
+ if (sock) {
+ sock->ops->getname(sock, (struct sockaddr *)&sin,
+ &sinlen, 0);
+ tsinfo.local_addr = sin.sin_addr.s_addr;
+ tsinfo.local_port = sin.sin_port;
+ sock->ops->getname(sock, (struct sockaddr *)&sin,
+ &sinlen, 1);
+ tsinfo.peer_addr = sin.sin_addr.s_addr;
+ tsinfo.peer_port = sin.sin_port;
+ }
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -360,7 +366,7 @@ struct rds_transport rds_tcp_transport = {
.t_mp_capable = 1,
};
-static int rds_tcp_netid;
+static unsigned int rds_tcp_netid;
/* per-network namespace private data for this module */
struct rds_tcp_net {
@@ -659,6 +665,8 @@ out_recv:
out_pernet:
unregister_pernet_subsys(&rds_tcp_net_ops);
out_slab:
+ if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
+ pr_warn("could not unregister rds_tcp_dev_notifier\n");
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c533ed3..d6839d96d539 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
- rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ /* Force the peer to reconnect so that we have the
+ * TCP ports going from <smaller-ip>.<transient> to
+ * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
+ * RDS connection as RDS_CONN_UP until the reconnect,
+ * to avoid RDS datagram loss.
+ */
+ if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
+ rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+ RDS_CONN_ERROR)) {
+ rds_conn_path_drop(cp);
+ } else {
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ }
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index e0b23fb5b8d5..f74bab3ecdca 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -83,27 +83,22 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
int i;
bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
- int npaths = conn->c_npaths;
-
- if (npaths <= 1) {
- struct rds_conn_path *cp = &conn->c_path[0];
- int ret;
-
- ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING);
- if (!ret)
- rds_conn_path_transition(cp, RDS_CONN_ERROR,
- RDS_CONN_CONNECTING);
- return cp->cp_transport_data;
- }
+ int npaths = max_t(int, 1, conn->c_npaths);
- /* for mprds, paths with cp_index > 0 MUST be initiated by the peer
+ /* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
- if (!peer_is_smaller)
+ if (!peer_is_smaller) {
+ /* Make sure we initiate at least one path if this
+ * has not already been done; rds_start_mprds() will
+ * take care of additional paths, if necessary.
+ */
+ if (npaths == 1)
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
return NULL;
+ }
- for (i = 1; i < npaths; i++) {
+ for (i = 0; i < npaths; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
- conn_state != RDS_CONN_ERROR)
+ WARN_ON(conn_state == RDS_CONN_UP);
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
goto rst_nsk;
if (rs_tcp->t_sock) {
/* Need to resolve a duelling SYN between peers.
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 89d09b481f47..dcf4742083ea 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
tc->t_last_expected_una = rm->m_ack_seq + 1;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
rm, rds_tcp_snd_nxt(tc),
(unsigned long long)rm->m_ack_seq);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index e42df11bf30a..e36e333a0aa0 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -171,8 +171,7 @@ void rds_connect_worker(struct work_struct *work)
RDS_CONN_DOWN))
rds_queue_reconnect(cp);
else
- rds_conn_path_error(cp,
- "RDS: connect failed\n");
+ rds_conn_path_error(cp, "connect failed\n");
}
}
}
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 129d357d2722..9ad301c46b88 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -34,7 +34,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/mm.h>
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 0fc76d845103..452bbb38d943 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 784c53163b7b..86f8853a038c 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -19,6 +19,20 @@ config AF_RXRPC
See Documentation/networking/rxrpc.txt.
+config AF_RXRPC_IPV6
+ bool "IPv6 support for RxRPC"
+ depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC)
+ help
+ Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as
+ its network transport.
+
+config AF_RXRPC_INJECT_LOSS
+ bool "Inject packet loss into RxRPC packet stream"
+ depends on AF_RXRPC
+ help
+ Say Y here to inject packet loss by discarding some received and some
+ transmitted packets.
+
config AF_RXRPC_DEBUG
bool "RxRPC dynamic debugging"
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 10f3f48a16a8..8fc6ea347182 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -22,6 +22,7 @@ af-rxrpc-y := \
peer_object.o \
recvmsg.o \
security.o \
+ sendmsg.o \
skbuff.o \
utils.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 88effadd4b16..5f63f6dcaabb 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -16,12 +16,14 @@
#include <linux/net.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
+#include <linux/random.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/key-type.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
+#define CREATE_TRACE_POINTS
#include "ar-internal.h"
MODULE_DESCRIPTION("RxRPC network protocol");
@@ -43,7 +45,7 @@ u32 rxrpc_epoch;
atomic_t rxrpc_debug_id;
/* count of skbs currently in use */
-atomic_t rxrpc_n_skbs;
+atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
struct workqueue_struct *rxrpc_workqueue;
@@ -104,19 +106,25 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
case AF_INET:
if (srx->transport_len < sizeof(struct sockaddr_in))
return -EINVAL;
- _debug("INET: %x @ %pI4",
- ntohs(srx->transport.sin.sin_port),
- &srx->transport.sin.sin_addr);
tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
case AF_INET6:
+ if (srx->transport_len < sizeof(struct sockaddr_in6))
+ return -EINVAL;
+ tail = offsetof(struct sockaddr_rxrpc, transport) +
+ sizeof(struct sockaddr_in6);
+ break;
+#endif
+
default:
return -EAFNOSUPPORT;
}
if (tail < len)
memset((void *)srx + tail, 0, len - tail);
+ _debug("INET: %pISp", &srx->transport);
return 0;
}
@@ -128,7 +136,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
struct sock *sk = sock->sk;
struct rxrpc_local *local;
- struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ u16 service_id = srx->srx_service;
int ret;
_enter("%p,%p,%d", rx, saddr, len);
@@ -152,16 +161,13 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
goto error_unlock;
}
- if (rx->srx.srx_service) {
- write_lock_bh(&local->services_lock);
- list_for_each_entry(prx, &local->services, listen_link) {
- if (prx->srx.srx_service == rx->srx.srx_service)
- goto service_in_use;
- }
-
+ if (service_id) {
+ write_lock(&local->services_lock);
+ if (rcu_access_pointer(local->service))
+ goto service_in_use;
rx->local = local;
- list_add_tail(&rx->listen_link, &local->services);
- write_unlock_bh(&local->services_lock);
+ rcu_assign_pointer(local->service, rx);
+ write_unlock(&local->services_lock);
rx->sk.sk_state = RXRPC_SERVER_BOUND;
} else {
@@ -174,7 +180,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
return 0;
service_in_use:
- write_unlock_bh(&local->services_lock);
+ write_unlock(&local->services_lock);
rxrpc_put_local(local);
ret = -EADDRINUSE;
error_unlock:
@@ -191,7 +197,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
struct rxrpc_sock *rx = rxrpc_sk(sk);
- unsigned int max;
+ unsigned int max, old;
int ret;
_enter("%p,%d", rx, backlog);
@@ -210,9 +216,13 @@ static int rxrpc_listen(struct socket *sock, int backlog)
backlog = max;
else if (backlog < 0 || backlog > max)
break;
+ old = sk->sk_max_ack_backlog;
sk->sk_max_ack_backlog = backlog;
- rx->sk.sk_state = RXRPC_SERVER_LISTENING;
- ret = 0;
+ ret = rxrpc_service_prealloc(rx, GFP_KERNEL);
+ if (ret == 0)
+ rx->sk.sk_state = RXRPC_SERVER_LISTENING;
+ else
+ sk->sk_max_ack_backlog = old;
break;
default:
ret = -EBUSY;
@@ -230,6 +240,8 @@ static int rxrpc_listen(struct socket *sock, int backlog)
* @srx: The address of the peer to contact
* @key: The security context to use (defaults to socket setting)
* @user_call_ID: The ID to use
+ * @gfp: The allocation constraints
+ * @notify_rx: Where to send notifications instead of socket queue
*
* Allow a kernel service to begin a call on the nominated socket. This just
* sets up all the internal tracking structures and allocates connection and
@@ -242,7 +254,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
struct sockaddr_rxrpc *srx,
struct key *key,
unsigned long user_call_ID,
- gfp_t gfp)
+ gfp_t gfp,
+ rxrpc_notify_rx_t notify_rx)
{
struct rxrpc_conn_parameters cp;
struct rxrpc_call *call;
@@ -269,6 +282,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
cp.exclusive = false;
cp.service_id = srx->srx_service;
call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
+ if (!IS_ERR(call))
+ call->notify_rx = notify_rx;
release_sock(&rx->sk);
_leave(" = %p", call);
@@ -278,40 +293,39 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
/**
* rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
+ * @sock: The socket the call is on
* @call: The call to end
*
* Allow a kernel service to end a call it was using. The call must be
* complete before this is called (the call should be aborted if necessary).
*/
-void rxrpc_kernel_end_call(struct rxrpc_call *call)
+void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
{
_enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
- rxrpc_remove_user_ID(call->socket, call);
- rxrpc_put_call(call);
+ rxrpc_release_call(rxrpc_sk(sock->sk), call);
+ rxrpc_put_call(call, rxrpc_call_put_kernel);
}
EXPORT_SYMBOL(rxrpc_kernel_end_call);
/**
- * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages
+ * rxrpc_kernel_new_call_notification - Get notifications of new calls
* @sock: The socket to intercept received messages on
- * @interceptor: The function to pass the messages to
+ * @notify_new_call: Function to be called when new calls appear
+ * @discard_new_call: Function to discard preallocated calls
*
- * Allow a kernel service to intercept messages heading for the Rx queue on an
- * RxRPC socket. They get passed to the specified function instead.
- * @interceptor should free the socket buffers it is given. @interceptor is
- * called with the socket receive queue spinlock held and softirqs disabled -
- * this ensures that the messages will be delivered in the right order.
+ * Allow a kernel service to be given notifications about new calls.
*/
-void rxrpc_kernel_intercept_rx_messages(struct socket *sock,
- rxrpc_interceptor_t interceptor)
+void rxrpc_kernel_new_call_notification(
+ struct socket *sock,
+ rxrpc_notify_new_call_t notify_new_call,
+ rxrpc_discard_new_call_t discard_new_call)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- _enter("");
- rx->interceptor = interceptor;
+ rx->notify_new_call = notify_new_call;
+ rx->discard_new_call = discard_new_call;
}
-
-EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
+EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
/*
* connect an RxRPC socket
@@ -391,6 +405,23 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
switch (rx->sk.sk_state) {
case RXRPC_UNBOUND:
+ rx->srx.srx_family = AF_RXRPC;
+ rx->srx.srx_service = 0;
+ rx->srx.transport_type = SOCK_DGRAM;
+ rx->srx.transport.family = rx->family;
+ switch (rx->family) {
+ case AF_INET:
+ rx->srx.transport_len = sizeof(struct sockaddr_in);
+ break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ rx->srx.transport_len = sizeof(struct sockaddr_in6);
+ break;
+#endif
+ default:
+ ret = -EAFNOSUPPORT;
+ goto error_unlock;
+ }
local = rxrpc_lookup_local(&rx->srx);
if (IS_ERR(local)) {
ret = PTR_ERR(local);
@@ -505,15 +536,16 @@ error:
static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
- unsigned int mask;
struct sock *sk = sock->sk;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ unsigned int mask;
sock_poll_wait(file, sk_sleep(sk), wait);
mask = 0;
/* the socket is readable if there are any messages waiting on the Rx
* queue */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!list_empty(&rx->recvmsg_q))
mask |= POLLIN | POLLRDNORM;
/* the socket is writable if there is space to add new data to the
@@ -540,7 +572,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
return -EAFNOSUPPORT;
/* we support transport protocol UDP/UDP6 only */
- if (protocol != PF_INET)
+ if (protocol != PF_INET &&
+ IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6)
return -EPROTONOSUPPORT;
if (sock->type != SOCK_DGRAM)
@@ -554,6 +587,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
return -ENOMEM;
sock_init_data(sock, sk);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sk->sk_state = RXRPC_UNBOUND;
sk->sk_write_space = rxrpc_write_space;
sk->sk_max_ack_backlog = 0;
@@ -563,9 +597,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
rx->family = protocol;
rx->calls = RB_ROOT;
- INIT_LIST_HEAD(&rx->listen_link);
- INIT_LIST_HEAD(&rx->secureq);
- INIT_LIST_HEAD(&rx->acceptq);
+ spin_lock_init(&rx->incoming_lock);
+ INIT_LIST_HEAD(&rx->sock_calls);
+ INIT_LIST_HEAD(&rx->to_be_accepted);
+ INIT_LIST_HEAD(&rx->recvmsg_q);
+ rwlock_init(&rx->recvmsg_lock);
rwlock_init(&rx->call_lock);
memset(&rx->srx, 0, sizeof(rx->srx));
@@ -574,6 +610,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
}
/*
+ * Kill all the calls on a socket and shut it down.
+ */
+static int rxrpc_shutdown(struct socket *sock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret = 0;
+
+ _enter("%p,%d", sk, flags);
+
+ if (flags != SHUT_RDWR)
+ return -EOPNOTSUPP;
+ if (sk->sk_state == RXRPC_CLOSE)
+ return -ESHUTDOWN;
+
+ lock_sock(sk);
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (sk->sk_state < RXRPC_CLOSE) {
+ sk->sk_state = RXRPC_CLOSE;
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ } else {
+ ret = -ESHUTDOWN;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ rxrpc_discard_prealloc(rx);
+
+ release_sock(sk);
+ return ret;
+}
+
+/*
* RxRPC socket destructor
*/
static void rxrpc_sock_destructor(struct sock *sk)
@@ -609,15 +678,14 @@ static int rxrpc_release_sock(struct sock *sk)
sk->sk_state = RXRPC_CLOSE;
spin_unlock_bh(&sk->sk_receive_queue.lock);
- ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1);
-
- if (!list_empty(&rx->listen_link)) {
- write_lock_bh(&rx->local->services_lock);
- list_del(&rx->listen_link);
- write_unlock_bh(&rx->local->services_lock);
+ if (rx->local && rcu_access_pointer(rx->local->service) == rx) {
+ write_lock(&rx->local->services_lock);
+ rcu_assign_pointer(rx->local->service, NULL);
+ write_unlock(&rx->local->services_lock);
}
/* try to flush out this socket */
+ rxrpc_discard_prealloc(rx);
rxrpc_release_calls_on_socket(rx);
flush_workqueue(rxrpc_workqueue);
rxrpc_purge_queue(&sk->sk_receive_queue);
@@ -666,7 +734,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
.poll = rxrpc_poll,
.ioctl = sock_no_ioctl,
.listen = rxrpc_listen,
- .shutdown = sock_no_shutdown,
+ .shutdown = rxrpc_shutdown,
.setsockopt = rxrpc_setsockopt,
.getsockopt = sock_no_getsockopt,
.sendmsg = rxrpc_sendmsg,
@@ -694,10 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = {
static int __init af_rxrpc_init(void)
{
int ret = -1;
+ unsigned int tmp;
BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
- rxrpc_epoch = get_seconds();
+ get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
+ rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
+ get_random_bytes(&tmp, sizeof(tmp));
+ tmp &= 0x3fffffff;
+ if (tmp == 0)
+ tmp = 1;
+ idr_set_cursor(&rxrpc_client_conn_ids, tmp);
ret = -ENOMEM;
rxrpc_call_jar = kmem_cache_create(
@@ -788,7 +863,8 @@ static void __exit af_rxrpc_exit(void)
proto_unregister(&rxrpc_proto);
rxrpc_destroy_all_calls();
rxrpc_destroy_all_connections();
- ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
rxrpc_destroy_all_locals();
remove_proc_entry("rxrpc_conns", init_net.proc_net);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ff83fb1ddd47..f60e35576526 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -35,11 +35,23 @@ struct rxrpc_crypt {
#define rxrpc_queue_delayed_work(WS,D) \
queue_delayed_work(rxrpc_workqueue, (WS), (D))
-#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
-
struct rxrpc_connection;
/*
+ * Mark applied to socket buffers.
+ */
+enum rxrpc_skb_mark {
+ RXRPC_SKB_MARK_DATA, /* data message */
+ RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */
+ RXRPC_SKB_MARK_BUSY, /* server busy message */
+ RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */
+ RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */
+ RXRPC_SKB_MARK_NET_ERROR, /* network error message */
+ RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */
+ RXRPC_SKB_MARK_NEW_CALL, /* local error message */
+};
+
+/*
* sk_state for RxRPC sockets
*/
enum {
@@ -52,19 +64,44 @@ enum {
};
/*
+ * Service backlog preallocation.
+ *
+ * This contains circular buffers of preallocated peers, connections and calls
+ * for incoming service calls and their head and tail pointers. This allows
+ * calls to be set up in the data_ready handler, thereby avoiding the need to
+ * shuffle packets around so much.
+ */
+struct rxrpc_backlog {
+ unsigned short peer_backlog_head;
+ unsigned short peer_backlog_tail;
+ unsigned short conn_backlog_head;
+ unsigned short conn_backlog_tail;
+ unsigned short call_backlog_head;
+ unsigned short call_backlog_tail;
+#define RXRPC_BACKLOG_MAX 32
+ struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX];
+ struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX];
+ struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX];
+};
+
+/*
* RxRPC socket definition
*/
struct rxrpc_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
- rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */
+ rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
+ rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
struct rxrpc_local *local; /* local endpoint */
- struct list_head listen_link; /* link in the local endpoint's listen list */
- struct list_head secureq; /* calls awaiting connection security clearance */
- struct list_head acceptq; /* calls awaiting acceptance */
+ struct rxrpc_backlog *backlog; /* Preallocation for services */
+ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
+ struct list_head sock_calls; /* List of calls owned by this socket */
+ struct list_head to_be_accepted; /* calls awaiting acceptance */
+ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */
+ rwlock_t recvmsg_lock; /* Lock for recvmsg_q */
struct key *key; /* security for this socket */
struct key *securities; /* list of server security descriptors */
- struct rb_root calls; /* outstanding calls on this socket */
+ struct rb_root calls; /* User ID -> call mapping */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
rwlock_t call_lock; /* lock for calls */
@@ -103,13 +140,11 @@ struct rxrpc_host_header {
* - max 48 bytes (struct sk_buff::cb)
*/
struct rxrpc_skb_priv {
- struct rxrpc_call *call; /* call with which associated */
- unsigned long resend_at; /* time in jiffies at which to resend */
union {
- unsigned int offset; /* offset into buffer of next read */
+ u8 nr_jumbo; /* Number of jumbo subpackets */
+ };
+ union {
int remain; /* amount of space remaining for next write */
- u32 error; /* network error code */
- bool need_resend; /* T if needs resending */
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -117,13 +152,6 @@ struct rxrpc_skb_priv {
#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
-enum rxrpc_command {
- RXRPC_CMD_SEND_DATA, /* send data message */
- RXRPC_CMD_SEND_ABORT, /* request abort generation */
- RXRPC_CMD_ACCEPT, /* [server] accept incoming call */
- RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */
-};
-
/*
* RxRPC security module interface
*/
@@ -150,7 +178,12 @@ struct rxrpc_security {
void *);
/* verify the security on a received packet */
- int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *);
+ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
+ unsigned int, unsigned int, rxrpc_seq_t, u16);
+
+ /* Locate the data in a received packet that has been verified. */
+ void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
+ unsigned int *, unsigned int *);
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
@@ -180,9 +213,8 @@ struct rxrpc_local {
struct list_head link;
struct socket *socket; /* my UDP socket */
struct work_struct processor;
- struct list_head services; /* services listening on this endpoint */
+ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
- struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
struct rb_root client_conns; /* Client connections by socket params */
@@ -220,10 +252,12 @@ struct rxrpc_peer {
/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
- suseconds_t rtt; /* current RTT estimate (in uS) */
- unsigned int rtt_point; /* next entry at which to insert */
- unsigned int rtt_usage; /* amount of cache actually used */
- suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ u64 rtt; /* Current RTT estimate (in nS) */
+ u64 rtt_sum; /* Sum of cache contents */
+ u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
+ u8 rtt_cursor; /* next entry at which to insert */
+ u8 rtt_usage; /* amount of cache actually used */
};
/*
@@ -255,6 +289,9 @@ enum rxrpc_conn_flag {
RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
+ RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */
+ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */
+ RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */
};
/*
@@ -265,17 +302,29 @@ enum rxrpc_conn_event {
};
/*
+ * The connection cache state.
+ */
+enum rxrpc_conn_cache_state {
+ RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */
+ RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */
+ RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
+ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
+ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
+ RXRPC_CONN__NR_CACHE_STATES
+};
+
+/*
* The connection protocol state.
*/
enum rxrpc_conn_proto_state {
RXRPC_CONN_UNUSED, /* Connection not yet attempted */
RXRPC_CONN_CLIENT, /* Client connection */
+ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */
RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */
RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */
RXRPC_CONN_SERVICE, /* Service secured connection */
RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */
RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */
- RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */
RXRPC_CONN__NR_STATES
};
@@ -288,23 +337,33 @@ struct rxrpc_connection {
struct rxrpc_conn_proto proto;
struct rxrpc_conn_parameters params;
- spinlock_t channel_lock;
+ atomic_t usage;
+ struct rcu_head rcu;
+ struct list_head cache_link;
+ spinlock_t channel_lock;
+ unsigned char active_chans; /* Mask of active channels */
+#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1)
+ struct list_head waiting_calls; /* Calls waiting for channels */
struct rxrpc_channel {
struct rxrpc_call __rcu *call; /* Active call */
u32 call_id; /* ID of current call */
u32 call_counter; /* Call ID counter */
u32 last_call; /* ID of last call */
- u32 last_result; /* Result of last call (0/abort) */
+ u8 last_type; /* Type of last packet */
+ u16 last_service_id;
+ union {
+ u32 last_seq;
+ u32 last_abort;
+ };
} channels[RXRPC_MAXCALLS];
- wait_queue_head_t channel_wq; /* queue to wait for channel to become available */
- struct rcu_head rcu;
struct work_struct processor; /* connection event processor */
union {
struct rb_node client_node; /* Node in local->client_conns */
struct rb_node service_node; /* Node in peer->service_conns */
};
+ struct list_head proc_link; /* link in procfs list */
struct list_head link; /* link in master connection list */
struct sk_buff_head rx_queue; /* received conn-level packets */
const struct rxrpc_security *security; /* applied security module */
@@ -313,21 +372,18 @@ struct rxrpc_connection {
struct rxrpc_crypt csum_iv; /* packet checksum base */
unsigned long flags;
unsigned long events;
- unsigned long put_time; /* Time at which last put */
+ unsigned long idle_timestamp; /* Time at which last became idle */
spinlock_t state_lock; /* state-change lock */
- atomic_t usage;
- enum rxrpc_conn_proto_state state : 8; /* current state of connection */
+ enum rxrpc_conn_cache_state cache_state;
+ enum rxrpc_conn_proto_state state; /* current state of connection */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
- int error; /* local error incurred */
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
- atomic_t hi_serial; /* highest serial number received */
- atomic_t avail_chans; /* number of channels available */
+ unsigned int hi_serial; /* highest serial number received */
+ u32 security_nonce; /* response re-use preventer */
u8 size_align; /* data size alignment (for security) */
- u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_nonce; /* response re-use preventer */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -337,37 +393,25 @@ struct rxrpc_connection {
*/
enum rxrpc_call_flag {
RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */
- RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */
- RXRPC_CALL_RCVD_LAST, /* all packets received */
- RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */
- RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */
- RXRPC_CALL_PROC_BUSY, /* the processor is busy */
- RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */
RXRPC_CALL_HAS_USERID, /* has a user ID attached */
- RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */
+ RXRPC_CALL_IS_SERVICE, /* Call is service call */
+ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
+ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
+ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
+ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
+ RXRPC_CALL_PINGING, /* Ping in process */
+ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
};
/*
* Events that can be raised on a call.
*/
enum rxrpc_call_event {
- RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */
- RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */
- RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */
- RXRPC_CALL_EV_RCVD_ERROR, /* network error received */
- RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */
RXRPC_CALL_EV_ACK, /* need to generate ACK */
- RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */
RXRPC_CALL_EV_ABORT, /* need to generate abort */
- RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */
- RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */
+ RXRPC_CALL_EV_TIMER, /* Timer expired */
RXRPC_CALL_EV_RESEND, /* Tx resend required */
- RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */
- RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */
- RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */
- RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */
- RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */
- RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */
+ RXRPC_CALL_EV_PING, /* Ping send required */
};
/*
@@ -379,20 +423,38 @@ enum rxrpc_call_state {
RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
- RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */
+ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */
RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
- RXRPC_CALL_COMPLETE, /* - call completed */
- RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */
+ RXRPC_CALL_COMPLETE, /* - call complete */
+ NR__RXRPC_CALL_STATES
+};
+
+/*
+ * Call completion condition (state == RXRPC_CALL_COMPLETE).
+ */
+enum rxrpc_call_completion {
+ RXRPC_CALL_SUCCEEDED, /* - Normal termination */
RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
+ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */
RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */
- RXRPC_CALL_DEAD, /* - call is dead */
- NR__RXRPC_CALL_STATES
+ NR__RXRPC_CALL_COMPLETIONS
+};
+
+/*
+ * Call Tx congestion management modes.
+ */
+enum rxrpc_congest_mode {
+ RXRPC_CALL_SLOW_START,
+ RXRPC_CALL_CONGEST_AVOIDANCE,
+ RXRPC_CALL_PACKET_LOSS,
+ RXRPC_CALL_FAST_RETRANSMIT,
+ NR__RXRPC_CONGEST_MODES
};
/*
@@ -402,92 +464,335 @@ enum rxrpc_call_state {
struct rxrpc_call {
struct rcu_head rcu;
struct rxrpc_connection *conn; /* connection carrying call */
- struct rxrpc_sock *socket; /* socket responsible */
- struct timer_list lifetimer; /* lifetime remaining on call */
- struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */
- struct timer_list ack_timer; /* ACK generation timer */
- struct timer_list resend_timer; /* Tx resend timer */
- struct work_struct destroyer; /* call destroyer */
- struct work_struct processor; /* packet processor and ACK generator */
+ struct rxrpc_peer *peer; /* Peer record for remote address */
+ struct rxrpc_sock __rcu *socket; /* socket responsible */
+ ktime_t ack_at; /* When deferred ACK needs to happen */
+ ktime_t resend_at; /* When next resend needs to happen */
+ ktime_t ping_at; /* When next to send a ping */
+ ktime_t expire_at; /* When the call times out */
+ struct timer_list timer; /* Combined event timer */
+ struct work_struct processor; /* Event processor */
+ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
struct list_head link; /* link in master call list */
+ struct list_head chan_wait_link; /* Link in conn->waiting_calls */
struct hlist_node error_link; /* link in error distribution list */
- struct list_head accept_link; /* calls awaiting acceptance */
- struct rb_node sock_node; /* node in socket call tree */
- struct sk_buff_head rx_queue; /* received packets */
- struct sk_buff_head rx_oos_queue; /* packets received out of sequence */
+ struct list_head accept_link; /* Link in rx->acceptq */
+ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
+ struct list_head sock_link; /* Link in rx->sock_calls */
+ struct rb_node sock_node; /* Node in rx->calls */
struct sk_buff *tx_pending; /* Tx socket buffer being filled */
- wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */
+ wait_queue_head_t waitq; /* Wait queue for channel or Tx */
__be32 crypto_buf[2]; /* Temporary packet crypto buffer */
unsigned long user_call_ID; /* user-defined call ID */
- unsigned long creation_jif; /* time of call creation */
unsigned long flags;
unsigned long events;
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
- atomic_t usage;
- atomic_t skb_count; /* Outstanding packets on this call */
- atomic_t sequence; /* Tx data packet sequence counter */
- u32 local_abort; /* local abort code */
- u32 remote_abort; /* remote abort code */
- int error_report; /* Network error (ICMP/local transport) */
+ u32 abort_code; /* Local/remote abort code */
int error; /* Local error incurred */
- enum rxrpc_call_state state : 8; /* current state of call */
+ enum rxrpc_call_state state; /* current state of call */
+ enum rxrpc_call_completion completion; /* Call completion condition */
+ atomic_t usage;
+ u16 service_id; /* service ID */
+ u8 security_ix; /* Security type */
+ u32 call_id; /* call ID on connection */
+ u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
- u8 channel; /* connection channel occupied by this call */
-
- /* transmission-phase ACK management */
- u8 acks_head; /* offset into window of first entry */
- u8 acks_tail; /* offset into window of last entry */
- u8 acks_winsz; /* size of un-ACK'd window */
- u8 acks_unacked; /* lowest unacked packet in last ACK received */
- int acks_latest; /* serial number of latest ACK received */
- rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */
- unsigned long *acks_window; /* sent packet window
- * - elements are pointers with LSB set if ACK'd
+ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
+ unsigned short rx_pkt_len; /* Current recvmsg packet len */
+
+ /* Rx/Tx circular buffer, depending on phase.
+ *
+ * In the Rx phase, packets are annotated with 0 or the number of the
+ * segment of a jumbo packet each buffer refers to. There can be up to
+ * 47 segments in a maximum-size UDP packet.
+ *
+ * In the Tx phase, packets are annotated with which buffers have been
+ * acked.
+ */
+#define RXRPC_RXTX_BUFF_SIZE 64
+#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
+#define RXRPC_INIT_RX_WINDOW_SIZE 32
+ struct sk_buff **rxtx_buffer;
+ u8 *rxtx_annotations;
+#define RXRPC_TX_ANNO_ACK 0
+#define RXRPC_TX_ANNO_UNACK 1
+#define RXRPC_TX_ANNO_NAK 2
+#define RXRPC_TX_ANNO_RETRANS 3
+#define RXRPC_TX_ANNO_MASK 0x03
+#define RXRPC_TX_ANNO_LAST 0x04
+#define RXRPC_TX_ANNO_RESENT 0x08
+
+#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
+#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
+#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
+ rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
+ * not hard-ACK'd packet follows this.
+ */
+ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+
+ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
+ * is fixed, we keep these numbers in terms of segments (ie. DATA
+ * packets) rather than bytes.
+ */
+#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
+ u8 cong_cwnd; /* Congestion window size */
+ u8 cong_extra; /* Extra to send for congestion management */
+ u8 cong_ssthresh; /* Slow-start threshold */
+ enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
+ u8 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u8 cong_cumul_acks; /* Cumulative ACK count */
+ ktime_t cong_tstamp; /* Last time cwnd was changed */
+
+ rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
+ * consumed packet follows this.
*/
+ rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
+ rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
+ u8 rx_winsize; /* Size of Rx window */
+ u8 tx_winsize; /* Maximum size of Tx window */
+ bool tx_phase; /* T if transmission phase, F if receive phase */
+ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
/* receive-phase ACK management */
- rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */
- rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */
- rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */
- rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */
- rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */
- rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */
- rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
u8 ackr_reason; /* reason to ACK */
+ u16 ackr_skew; /* skew on packet being ACK'd */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
- atomic_t ackr_not_idle; /* number of packets in Rx queue */
+ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
+ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
+ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
- /* received packet records, 1 bit per record */
-#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG)
- unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1];
+ /* ping management */
+ rxrpc_serial_t ping_serial; /* Last ping sent */
+ ktime_t ping_time; /* Time last ping sent */
- u8 in_clientflag; /* Copy of conn->in_clientflag */
- struct rxrpc_local *local; /* Local endpoint. */
- u32 call_id; /* call ID on connection */
- u32 cid; /* connection ID plus channel index */
- u32 epoch; /* epoch of this connection */
- u16 service_id; /* service ID */
+ /* transmission-phase ACK management */
+ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
+ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
+ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
};
/*
- * locally abort an RxRPC call
+ * Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
-static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
-{
- write_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE) {
- call->local_abort = abort_code;
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- }
- write_unlock_bh(&call->state_lock);
-}
+struct rxrpc_ack_summary {
+ u8 ack_reason;
+ u8 nr_acks; /* Number of ACKs in packet */
+ u8 nr_nacks; /* Number of NACKs in packet */
+ u8 nr_new_acks; /* Number of new ACKs in packet */
+ u8 nr_new_nacks; /* Number of new NACKs in packet */
+ u8 nr_rot_new_acks; /* Number of rotated new ACKs */
+ bool new_low_nack; /* T if new low NACK found */
+ bool retrans_timeo; /* T if reTx due to timeout happened */
+ u8 flight_size; /* Number of unreceived transmissions */
+ /* Place to stash values for tracing */
+ enum rxrpc_congest_mode mode:8;
+ u8 cwnd;
+ u8 ssthresh;
+ u8 dup_acks;
+ u8 cumulative_acks;
+};
+
+enum rxrpc_skb_trace {
+ rxrpc_skb_rx_cleaned,
+ rxrpc_skb_rx_freed,
+ rxrpc_skb_rx_got,
+ rxrpc_skb_rx_lost,
+ rxrpc_skb_rx_received,
+ rxrpc_skb_rx_rotated,
+ rxrpc_skb_rx_purged,
+ rxrpc_skb_rx_seen,
+ rxrpc_skb_tx_cleaned,
+ rxrpc_skb_tx_freed,
+ rxrpc_skb_tx_got,
+ rxrpc_skb_tx_new,
+ rxrpc_skb_tx_rotated,
+ rxrpc_skb_tx_seen,
+ rxrpc_skb__nr_trace
+};
+
+extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
+
+enum rxrpc_conn_trace {
+ rxrpc_conn_new_client,
+ rxrpc_conn_new_service,
+ rxrpc_conn_queued,
+ rxrpc_conn_seen,
+ rxrpc_conn_got,
+ rxrpc_conn_put_client,
+ rxrpc_conn_put_service,
+ rxrpc_conn__nr_trace
+};
+
+extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
+
+enum rxrpc_client_trace {
+ rxrpc_client_activate_chans,
+ rxrpc_client_alloc,
+ rxrpc_client_chan_activate,
+ rxrpc_client_chan_disconnect,
+ rxrpc_client_chan_pass,
+ rxrpc_client_chan_unstarted,
+ rxrpc_client_cleanup,
+ rxrpc_client_count,
+ rxrpc_client_discard,
+ rxrpc_client_duplicate,
+ rxrpc_client_exposed,
+ rxrpc_client_replace,
+ rxrpc_client_to_active,
+ rxrpc_client_to_culled,
+ rxrpc_client_to_idle,
+ rxrpc_client_to_inactive,
+ rxrpc_client_to_waiting,
+ rxrpc_client_uncount,
+ rxrpc_client__nr_trace
+};
+
+extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
+extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
+
+enum rxrpc_call_trace {
+ rxrpc_call_new_client,
+ rxrpc_call_new_service,
+ rxrpc_call_queued,
+ rxrpc_call_queued_ref,
+ rxrpc_call_seen,
+ rxrpc_call_connected,
+ rxrpc_call_release,
+ rxrpc_call_got,
+ rxrpc_call_got_userid,
+ rxrpc_call_got_kernel,
+ rxrpc_call_put,
+ rxrpc_call_put_userid,
+ rxrpc_call_put_kernel,
+ rxrpc_call_put_noqueue,
+ rxrpc_call_error,
+ rxrpc_call__nr_trace
+};
+
+extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
+
+enum rxrpc_transmit_trace {
+ rxrpc_transmit_wait,
+ rxrpc_transmit_queue,
+ rxrpc_transmit_queue_last,
+ rxrpc_transmit_rotate,
+ rxrpc_transmit_rotate_last,
+ rxrpc_transmit_await_reply,
+ rxrpc_transmit_end,
+ rxrpc_transmit__nr_trace
+};
+
+extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
+
+enum rxrpc_receive_trace {
+ rxrpc_receive_incoming,
+ rxrpc_receive_queue,
+ rxrpc_receive_queue_last,
+ rxrpc_receive_front,
+ rxrpc_receive_rotate,
+ rxrpc_receive_end,
+ rxrpc_receive__nr_trace
+};
+
+extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
+
+enum rxrpc_recvmsg_trace {
+ rxrpc_recvmsg_enter,
+ rxrpc_recvmsg_wait,
+ rxrpc_recvmsg_dequeue,
+ rxrpc_recvmsg_hole,
+ rxrpc_recvmsg_next,
+ rxrpc_recvmsg_cont,
+ rxrpc_recvmsg_full,
+ rxrpc_recvmsg_data_return,
+ rxrpc_recvmsg_terminal,
+ rxrpc_recvmsg_to_be_accepted,
+ rxrpc_recvmsg_return,
+ rxrpc_recvmsg__nr_trace
+};
+
+extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
+
+enum rxrpc_rtt_tx_trace {
+ rxrpc_rtt_tx_ping,
+ rxrpc_rtt_tx_data,
+ rxrpc_rtt_tx__nr_trace
+};
+
+extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
+
+enum rxrpc_rtt_rx_trace {
+ rxrpc_rtt_rx_ping_response,
+ rxrpc_rtt_rx_requested_ack,
+ rxrpc_rtt_rx__nr_trace
+};
+
+extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
+
+enum rxrpc_timer_trace {
+ rxrpc_timer_begin,
+ rxrpc_timer_init_for_reply,
+ rxrpc_timer_init_for_send_reply,
+ rxrpc_timer_expired,
+ rxrpc_timer_set_for_ack,
+ rxrpc_timer_set_for_ping,
+ rxrpc_timer_set_for_resend,
+ rxrpc_timer_set_for_send,
+ rxrpc_timer__nr_trace
+};
+
+extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
+
+enum rxrpc_propose_ack_trace {
+ rxrpc_propose_ack_client_tx_end,
+ rxrpc_propose_ack_input_data,
+ rxrpc_propose_ack_ping_for_lost_ack,
+ rxrpc_propose_ack_ping_for_lost_reply,
+ rxrpc_propose_ack_ping_for_params,
+ rxrpc_propose_ack_processing_op,
+ rxrpc_propose_ack_respond_to_ack,
+ rxrpc_propose_ack_respond_to_ping,
+ rxrpc_propose_ack_retry_tx,
+ rxrpc_propose_ack_rotate_rx,
+ rxrpc_propose_ack_terminal_ack,
+ rxrpc_propose_ack__nr_trace
+};
+
+enum rxrpc_propose_ack_outcome {
+ rxrpc_propose_ack_use,
+ rxrpc_propose_ack_update,
+ rxrpc_propose_ack_subsume,
+ rxrpc_propose_ack__nr_outcomes
+};
+
+extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
+extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
+
+enum rxrpc_congest_change {
+ rxrpc_cong_begin_retransmission,
+ rxrpc_cong_cleared_nacks,
+ rxrpc_cong_new_low_nack,
+ rxrpc_cong_no_change,
+ rxrpc_cong_progress,
+ rxrpc_cong_retransmit_again,
+ rxrpc_cong_rtt_window_end,
+ rxrpc_cong_saw_nack,
+ rxrpc_congest__nr_change
+};
+
+extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
+extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
+
+extern const char *const rxrpc_pkts[];
+extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
+
+#include <trace/events/rxrpc.h>
/*
* af_rxrpc.c
*/
-extern atomic_t rxrpc_n_skbs;
+extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
extern u32 rxrpc_epoch;
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
@@ -495,70 +800,179 @@ extern struct workqueue_struct *rxrpc_workqueue;
/*
* call_accept.c
*/
+int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
+void rxrpc_discard_prealloc(struct rxrpc_sock *);
+struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
+ struct rxrpc_connection *,
+ struct sk_buff *);
void rxrpc_accept_incoming_calls(struct rxrpc_local *);
-struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long);
+struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
+ rxrpc_notify_rx_t);
int rxrpc_reject_call(struct rxrpc_sock *);
/*
* call_event.c
*/
-void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
+void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
+void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
+ enum rxrpc_propose_ack_trace);
void rxrpc_process_call(struct work_struct *);
/*
* call_object.c
*/
+extern const char *const rxrpc_call_states[];
+extern const char *const rxrpc_call_completions[];
extern unsigned int rxrpc_max_call_lifetime;
-extern unsigned int rxrpc_dead_call_expiry;
extern struct kmem_cache *rxrpc_call_jar;
extern struct list_head rxrpc_calls;
extern rwlock_t rxrpc_call_lock;
struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
+struct rxrpc_call *rxrpc_alloc_call(gfp_t);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *,
unsigned long, gfp_t);
-struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
- struct rxrpc_connection *,
- struct sk_buff *);
-void rxrpc_release_call(struct rxrpc_call *);
+void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
+ struct sk_buff *);
+void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
-void __rxrpc_put_call(struct rxrpc_call *);
+bool __rxrpc_queue_call(struct rxrpc_call *);
+bool rxrpc_queue_call(struct rxrpc_call *);
+void rxrpc_see_call(struct rxrpc_call *);
+void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_cleanup_call(struct rxrpc_call *);
void __exit rxrpc_destroy_all_calls(void);
+static inline bool rxrpc_is_service_call(const struct rxrpc_call *call)
+{
+ return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags);
+}
+
+static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
+{
+ return !rxrpc_is_service_call(call);
+}
+
+/*
+ * Transition a call to the complete state.
+ */
+static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
+ enum rxrpc_call_completion compl,
+ u32 abort_code,
+ int error)
+{
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->abort_code = abort_code;
+ call->error = error;
+ call->completion = compl,
+ call->state = RXRPC_CALL_COMPLETE;
+ wake_up(&call->waitq);
+ return true;
+ }
+ return false;
+}
+
+static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
+ enum rxrpc_call_completion compl,
+ u32 abort_code,
+ int error)
+{
+ bool ret;
+
+ write_lock_bh(&call->state_lock);
+ ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
+ write_unlock_bh(&call->state_lock);
+ return ret;
+}
+
+/*
+ * Record that a call successfully completed.
+ */
+static inline bool __rxrpc_call_completed(struct rxrpc_call *call)
+{
+ return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
+}
+
+static inline bool rxrpc_call_completed(struct rxrpc_call *call)
+{
+ bool ret;
+
+ write_lock_bh(&call->state_lock);
+ ret = __rxrpc_call_completed(call);
+ write_unlock_bh(&call->state_lock);
+ return ret;
+}
+
+/*
+ * Record that a call is locally aborted.
+ */
+static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call,
+ rxrpc_seq_t seq,
+ u32 abort_code, int error)
+{
+ trace_rxrpc_abort(why, call->cid, call->call_id, seq,
+ abort_code, error);
+ return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED,
+ abort_code, error);
+}
+
+static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call,
+ rxrpc_seq_t seq, u32 abort_code, int error)
+{
+ bool ret;
+
+ write_lock_bh(&call->state_lock);
+ ret = __rxrpc_abort_call(why, call, seq, abort_code, error);
+ write_unlock_bh(&call->state_lock);
+ return ret;
+}
+
/*
* conn_client.c
*/
+extern unsigned int rxrpc_max_client_connections;
+extern unsigned int rxrpc_reap_client_connections;
+extern unsigned int rxrpc_conn_idle_client_expiry;
+extern unsigned int rxrpc_conn_idle_client_fast_expiry;
extern struct idr rxrpc_client_conn_ids;
void rxrpc_destroy_client_conn_ids(void);
int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *, gfp_t);
-void rxrpc_unpublish_client_conn(struct rxrpc_connection *);
+void rxrpc_expose_client_call(struct rxrpc_call *);
+void rxrpc_disconnect_client_call(struct rxrpc_call *);
+void rxrpc_put_client_conn(struct rxrpc_connection *);
+void __exit rxrpc_destroy_all_client_connections(void);
/*
* conn_event.c
*/
void rxrpc_process_connection(struct work_struct *);
-void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
-void rxrpc_reject_packets(struct rxrpc_local *);
/*
* conn_object.c
*/
extern unsigned int rxrpc_connection_expiry;
extern struct list_head rxrpc_connections;
+extern struct list_head rxrpc_connection_proc_list;
extern rwlock_t rxrpc_connection_lock;
int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
struct sk_buff *);
-void __rxrpc_disconnect_call(struct rxrpc_call *);
+void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
void rxrpc_disconnect_call(struct rxrpc_call *);
-void rxrpc_put_connection(struct rxrpc_connection *);
+void rxrpc_kill_connection(struct rxrpc_connection *);
+bool rxrpc_queue_conn(struct rxrpc_connection *);
+void rxrpc_see_connection(struct rxrpc_connection *);
+void rxrpc_get_connection(struct rxrpc_connection *);
+struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
+void rxrpc_put_service_conn(struct rxrpc_connection *);
void __exit rxrpc_destroy_all_connections(void);
static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
@@ -571,24 +985,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
return !rxrpc_conn_is_client(conn);
}
-static inline void rxrpc_get_connection(struct rxrpc_connection *conn)
+static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
{
- atomic_inc(&conn->usage);
-}
-
-static inline
-struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
-{
- return atomic_inc_not_zero(&conn->usage) ? conn : NULL;
-}
+ if (!conn)
+ return;
-static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
-{
- if (!rxrpc_get_connection_maybe(conn))
- return false;
- if (!rxrpc_queue_work(&conn->processor))
- rxrpc_put_connection(conn);
- return true;
+ if (rxrpc_conn_is_client(conn))
+ rxrpc_put_client_conn(conn);
+ else
+ rxrpc_put_service_conn(conn);
}
/*
@@ -596,17 +1001,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
*/
struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
struct sk_buff *);
-struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *,
- struct sockaddr_rxrpc *,
- struct sk_buff *);
+struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t);
+void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *);
void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
* input.c
*/
void rxrpc_data_ready(struct sock *);
-int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
-void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
/*
* insecure.c
@@ -668,25 +1070,25 @@ extern unsigned int rxrpc_idle_ack_delay;
extern unsigned int rxrpc_rx_window_size;
extern unsigned int rxrpc_rx_mtu;
extern unsigned int rxrpc_rx_jumbo_max;
+extern unsigned int rxrpc_resend_timeout;
-extern const char *const rxrpc_pkts[];
extern const s8 rxrpc_ack_priority[];
-extern const char *rxrpc_acks(u8 reason);
-
/*
* output.c
*/
-extern unsigned int rxrpc_resend_timeout;
-
-int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
-int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
+int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
+int rxrpc_send_abort_packet(struct rxrpc_call *);
+int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
+void rxrpc_reject_packets(struct rxrpc_local *);
/*
* peer_event.c
*/
void rxrpc_error_report(struct sock *);
void rxrpc_peer_error_distributor(struct work_struct *);
+void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
+ rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
/*
* peer_object.c
@@ -696,10 +1098,13 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
struct sockaddr_rxrpc *, gfp_t);
struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
+struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
+ struct rxrpc_peer *);
-static inline void rxrpc_get_peer(struct rxrpc_peer *peer)
+static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
{
atomic_inc(&peer->usage);
+ return peer;
}
static inline
@@ -718,14 +1123,13 @@ static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
/*
* proc.c
*/
-extern const char *const rxrpc_call_states[];
extern const struct file_operations rxrpc_call_seq_fops;
extern const struct file_operations rxrpc_connection_seq_fops;
/*
* recvmsg.c
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+void rxrpc_notify_socket(struct rxrpc_call *);
int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
@@ -744,9 +1148,21 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *);
int rxrpc_init_server_conn_security(struct rxrpc_connection *);
/*
+ * sendmsg.c
+ */
+int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
+
+/*
* skbuff.c
*/
+void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_packet_destructor(struct sk_buff *);
+void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_purge_queue(struct sk_buff_head *);
/*
* sysctl.c
@@ -764,6 +1180,23 @@ static inline void rxrpc_sysctl_exit(void) {}
*/
int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
+static inline bool before(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) < 0;
+}
+static inline bool before_eq(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) <= 0;
+}
+static inline bool after(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) > 0;
+}
+static inline bool after_eq(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) >= 0;
+}
+
/*
* debug tracing
*/
@@ -846,11 +1279,12 @@ do { \
#define ASSERTCMP(X, OP, Y) \
do { \
- unsigned long _x = (unsigned long)(X); \
- unsigned long _y = (unsigned long)(Y); \
+ __typeof__(X) _x = (X); \
+ __typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely(!(_x OP _y))) { \
- pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
- _x, _x, #OP, _y, _y); \
+ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
+ (unsigned long)_x, (unsigned long)_x, #OP, \
+ (unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
@@ -865,11 +1299,12 @@ do { \
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
- unsigned long _x = (unsigned long)(X); \
- unsigned long _y = (unsigned long)(Y); \
+ __typeof__(X) _x = (X); \
+ __typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely((C) && !(_x OP _y))) { \
pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
- _x, _x, #OP, _y, _y); \
+ (unsigned long)_x, (unsigned long)_x, #OP, \
+ (unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
@@ -893,54 +1328,3 @@ do { \
} while (0)
#endif /* __KDEBUGALL */
-
-/*
- * socket buffer accounting / leak finding
- */
-static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn)
-{
- //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
- //atomic_inc(&rxrpc_n_skbs);
-}
-
-#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__)
-
-static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn)
-{
- //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
- //atomic_dec(&rxrpc_n_skbs);
-}
-
-#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__)
-
-static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn)
-{
- if (skb) {
- CHECK_SLAB_OKAY(&skb->users);
- //_net("free skb %p %s [%d]",
- // skb, fn, atomic_read(&rxrpc_n_skbs));
- //atomic_dec(&rxrpc_n_skbs);
- kfree_skb(skb);
- }
-}
-
-#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__)
-
-static inline void rxrpc_purge_queue(struct sk_buff_head *list)
-{
- struct sk_buff *skb;
- while ((skb = skb_dequeue((list))) != NULL)
- rxrpc_free_skb(skb);
-}
-
-#define rxrpc_get_call(CALL) \
-do { \
- CHECK_SLAB_OKAY(&(CALL)->usage); \
- if (atomic_inc_return(&(CALL)->usage) == 1) \
- BUG(); \
-} while (0)
-
-#define rxrpc_put_call(CALL) \
-do { \
- __rxrpc_put_call(CALL); \
-} while (0)
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 9bae21e66d65..832d854c2d5c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -20,265 +20,409 @@
#include <linux/in6.h>
#include <linux/icmp.h>
#include <linux/gfp.h>
+#include <linux/circ_buf.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
#include "ar-internal.h"
/*
- * generate a connection-level abort
+ * Preallocate a single service call, connection and peer and, if possible,
+ * give them a user ID and attach the user's side of the ID to them.
*/
-static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
- struct rxrpc_wire_header *whdr)
+static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
+ struct rxrpc_backlog *b,
+ rxrpc_notify_rx_t notify_rx,
+ rxrpc_user_attach_call_t user_attach_call,
+ unsigned long user_call_ID, gfp_t gfp)
{
- struct msghdr msg;
- struct kvec iov[1];
- size_t len;
- int ret;
+ const void *here = __builtin_return_address(0);
+ struct rxrpc_call *call;
+ int max, tmp;
+ unsigned int size = RXRPC_BACKLOG_MAX;
+ unsigned int head, tail, call_head, call_tail;
+
+ max = rx->sk.sk_max_ack_backlog;
+ tmp = rx->sk.sk_ack_backlog;
+ if (tmp >= max) {
+ _leave(" = -ENOBUFS [full %u]", max);
+ return -ENOBUFS;
+ }
+ max -= tmp;
+
+ /* We don't need more conns and peers than we have calls, but on the
+ * other hand, we shouldn't ever use more peers than conns or conns
+ * than calls.
+ */
+ call_head = b->call_backlog_head;
+ call_tail = READ_ONCE(b->call_backlog_tail);
+ tmp = CIRC_CNT(call_head, call_tail, size);
+ if (tmp >= max) {
+ _leave(" = -ENOBUFS [enough %u]", tmp);
+ return -ENOBUFS;
+ }
+ max = tmp + 1;
+
+ head = b->peer_backlog_head;
+ tail = READ_ONCE(b->peer_backlog_tail);
+ if (CIRC_CNT(head, tail, size) < max) {
+ struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp);
+ if (!peer)
+ return -ENOMEM;
+ b->peer_backlog[head] = peer;
+ smp_store_release(&b->peer_backlog_head,
+ (head + 1) & (size - 1));
+ }
- _enter("%d,,", local->debug_id);
+ head = b->conn_backlog_head;
+ tail = READ_ONCE(b->conn_backlog_tail);
+ if (CIRC_CNT(head, tail, size) < max) {
+ struct rxrpc_connection *conn;
- whdr->type = RXRPC_PACKET_TYPE_BUSY;
- whdr->serial = htonl(1);
+ conn = rxrpc_prealloc_service_connection(gfp);
+ if (!conn)
+ return -ENOMEM;
+ b->conn_backlog[head] = conn;
+ smp_store_release(&b->conn_backlog_head,
+ (head + 1) & (size - 1));
- msg.msg_name = &srx->transport.sin;
- msg.msg_namelen = sizeof(srx->transport.sin);
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage), here);
+ }
- iov[0].iov_base = whdr;
- iov[0].iov_len = sizeof(*whdr);
+ /* Now it gets complicated, because calls get registered with the
+ * socket here, particularly if a user ID is preassigned by the user.
+ */
+ call = rxrpc_alloc_call(gfp);
+ if (!call)
+ return -ENOMEM;
+ call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
+ call->state = RXRPC_CALL_SERVER_PREALLOC;
- len = iov[0].iov_len;
+ trace_rxrpc_call(call, rxrpc_call_new_service,
+ atomic_read(&call->usage),
+ here, (const void *)user_call_ID);
- _proto("Tx BUSY %%1");
+ write_lock(&rx->call_lock);
+ if (user_attach_call) {
+ struct rxrpc_call *xcall;
+ struct rb_node *parent, **pp;
+
+ /* Check the user ID isn't already in use */
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ xcall = rb_entry(parent, struct rxrpc_call, sock_node);
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto id_in_use;
+ }
- ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
- if (ret < 0) {
- _leave(" = -EAGAIN [sendmsg failed: %d]", ret);
- return -EAGAIN;
+ call->user_call_ID = user_call_ID;
+ call->notify_rx = notify_rx;
+ rxrpc_get_call(call, rxrpc_call_got_kernel);
+ user_attach_call(call, user_call_ID);
+ rxrpc_get_call(call, rxrpc_call_got_userid);
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
}
- _leave(" = 0");
+ list_add(&call->sock_link, &rx->sock_calls);
+
+ write_unlock(&rx->call_lock);
+
+ write_lock(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock(&rxrpc_call_lock);
+
+ b->call_backlog[call_head] = call;
+ smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1));
+ _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID);
return 0;
+
+id_in_use:
+ write_unlock(&rx->call_lock);
+ rxrpc_cleanup_call(call);
+ _leave(" = -EBADSLT");
+ return -EBADSLT;
}
/*
- * accept an incoming call that needs peer, transport and/or connection setting
- * up
+ * Preallocate sufficient service connections, calls and peers to cover the
+ * entire backlog of a socket. When a new call comes in, if we don't have
+ * sufficient of each available, the call gets rejected as busy or ignored.
+ *
+ * The backlog is replenished when a connection is accepted or rejected.
*/
-static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
- struct rxrpc_sock *rx,
- struct sk_buff *skb,
- struct sockaddr_rxrpc *srx)
+int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp)
{
- struct rxrpc_connection *conn;
- struct rxrpc_skb_priv *sp, *nsp;
- struct rxrpc_call *call;
- struct sk_buff *notification;
- int ret;
+ struct rxrpc_backlog *b = rx->backlog;
- _enter("");
+ if (!b) {
+ b = kzalloc(sizeof(struct rxrpc_backlog), gfp);
+ if (!b)
+ return -ENOMEM;
+ rx->backlog = b;
+ }
+
+ if (rx->discard_new_call)
+ return 0;
+
+ while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0)
+ ;
- sp = rxrpc_skb(skb);
+ return 0;
+}
- /* get a notification message to send to the server app */
- notification = alloc_skb(0, GFP_NOFS);
- if (!notification) {
- _debug("no memory");
- ret = -ENOMEM;
- goto error_nofree;
+/*
+ * Discard the preallocation on a service.
+ */
+void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
+{
+ struct rxrpc_backlog *b = rx->backlog;
+ unsigned int size = RXRPC_BACKLOG_MAX, head, tail;
+
+ if (!b)
+ return;
+ rx->backlog = NULL;
+
+ /* Make sure that there aren't any incoming calls in progress before we
+ * clear the preallocation buffers.
+ */
+ spin_lock_bh(&rx->incoming_lock);
+ spin_unlock_bh(&rx->incoming_lock);
+
+ head = b->peer_backlog_head;
+ tail = b->peer_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_peer *peer = b->peer_backlog[tail];
+ kfree(peer);
+ tail = (tail + 1) & (size - 1);
}
- rxrpc_new_skb(notification);
- notification->mark = RXRPC_SKB_MARK_NEW_CALL;
-
- conn = rxrpc_incoming_connection(local, srx, skb);
- if (IS_ERR(conn)) {
- _debug("no conn");
- ret = PTR_ERR(conn);
- goto error;
+
+ head = b->conn_backlog_head;
+ tail = b->conn_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_connection *conn = b->conn_backlog[tail];
+ write_lock(&rxrpc_connection_lock);
+ list_del(&conn->link);
+ list_del(&conn->proc_link);
+ write_unlock(&rxrpc_connection_lock);
+ kfree(conn);
+ tail = (tail + 1) & (size - 1);
}
- call = rxrpc_incoming_call(rx, conn, skb);
- rxrpc_put_connection(conn);
- if (IS_ERR(call)) {
- _debug("no call");
- ret = PTR_ERR(call);
- goto error;
+ head = b->call_backlog_head;
+ tail = b->call_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_call *call = b->call_backlog[tail];
+ if (rx->discard_new_call) {
+ _debug("discard %lx", call->user_call_ID);
+ rx->discard_new_call(call, call->user_call_ID);
+ rxrpc_put_call(call, rxrpc_call_put_kernel);
+ }
+ rxrpc_call_completed(call);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ tail = (tail + 1) & (size - 1);
}
- /* attach the call to the socket */
- read_lock_bh(&local->services_lock);
- if (rx->sk.sk_state == RXRPC_CLOSE)
- goto invalid_service;
+ kfree(b);
+}
- write_lock(&rx->call_lock);
- if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) {
- rxrpc_get_call(call);
-
- spin_lock(&call->conn->state_lock);
- if (sp->hdr.securityIndex > 0 &&
- call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
- _debug("await conn sec");
- list_add_tail(&call->accept_link, &rx->secureq);
- call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
- set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
- rxrpc_queue_conn(call->conn);
- } else {
- _debug("conn ready");
- call->state = RXRPC_CALL_SERVER_ACCEPTING;
- list_add_tail(&call->accept_link, &rx->acceptq);
- rxrpc_get_call(call);
- atomic_inc(&call->skb_count);
- nsp = rxrpc_skb(notification);
- nsp->call = call;
-
- ASSERTCMP(atomic_read(&call->usage), >=, 3);
-
- _debug("notify");
- spin_lock(&call->lock);
- ret = rxrpc_queue_rcv_skb(call, notification, true,
- false);
- spin_unlock(&call->lock);
- notification = NULL;
- BUG_ON(ret < 0);
+/*
+ * Allocate a new incoming call from the prealloc pool, along with a connection
+ * and a peer as necessary.
+ */
+static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_local *local,
+ struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_backlog *b = rx->backlog;
+ struct rxrpc_peer *peer, *xpeer;
+ struct rxrpc_call *call;
+ unsigned short call_head, conn_head, peer_head;
+ unsigned short call_tail, conn_tail, peer_tail;
+ unsigned short call_count, conn_count;
+
+ /* #calls >= #conns >= #peers must hold true. */
+ call_head = smp_load_acquire(&b->call_backlog_head);
+ call_tail = b->call_backlog_tail;
+ call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX);
+ conn_head = smp_load_acquire(&b->conn_backlog_head);
+ conn_tail = b->conn_backlog_tail;
+ conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX);
+ ASSERTCMP(conn_count, >=, call_count);
+ peer_head = smp_load_acquire(&b->peer_backlog_head);
+ peer_tail = b->peer_backlog_tail;
+ ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=,
+ conn_count);
+
+ if (call_count == 0)
+ return NULL;
+
+ if (!conn) {
+ /* No connection. We're going to need a peer to start off
+ * with. If one doesn't yet exist, use a spare from the
+ * preallocation set. We dump the address into the spare in
+ * anticipation - and to save on stack space.
+ */
+ xpeer = b->peer_backlog[peer_tail];
+ if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0)
+ return NULL;
+
+ peer = rxrpc_lookup_incoming_peer(local, xpeer);
+ if (peer == xpeer) {
+ b->peer_backlog[peer_tail] = NULL;
+ smp_store_release(&b->peer_backlog_tail,
+ (peer_tail + 1) &
+ (RXRPC_BACKLOG_MAX - 1));
}
- spin_unlock(&call->conn->state_lock);
- _debug("queued");
+ /* Now allocate and set up the connection */
+ conn = b->conn_backlog[conn_tail];
+ b->conn_backlog[conn_tail] = NULL;
+ smp_store_release(&b->conn_backlog_tail,
+ (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
+ rxrpc_get_local(local);
+ conn->params.local = local;
+ conn->params.peer = peer;
+ rxrpc_see_connection(conn);
+ rxrpc_new_incoming_connection(conn, skb);
+ } else {
+ rxrpc_get_connection(conn);
}
- write_unlock(&rx->call_lock);
- _debug("process");
- rxrpc_fast_process_packet(call, skb);
+ /* And now we can allocate and set up a new call */
+ call = b->call_backlog[call_tail];
+ b->call_backlog[call_tail] = NULL;
+ smp_store_release(&b->call_backlog_tail,
+ (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
- _debug("done");
- read_unlock_bh(&local->services_lock);
- rxrpc_free_skb(notification);
- rxrpc_put_call(call);
- _leave(" = 0");
- return 0;
-
-invalid_service:
- _debug("invalid");
- read_unlock_bh(&local->services_lock);
-
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- rxrpc_get_call(call);
- rxrpc_queue_call(call);
- }
- read_unlock_bh(&call->state_lock);
- rxrpc_put_call(call);
- ret = -ECONNREFUSED;
-error:
- rxrpc_free_skb(notification);
-error_nofree:
- _leave(" = %d", ret);
- return ret;
+ rxrpc_see_call(call);
+ call->conn = conn;
+ call->peer = rxrpc_get_peer(conn->params.peer);
+ return call;
}
/*
- * accept incoming calls that need peer, transport and/or connection setting up
- * - the packets we get are all incoming client DATA packets that have seq == 1
+ * Set up a new incoming call. Called in BH context with the RCU read lock
+ * held.
+ *
+ * If this is for a kernel service, when we allocate the call, it will have
+ * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the
+ * retainer ref obtained from the backlog buffer. Prealloc calls for userspace
+ * services only have the ref from the backlog buffer. We want to pass this
+ * ref to non-BH context to dispose of.
+ *
+ * If we want to report an error, we mark the skb with the packet type and
+ * abort code and return NULL.
*/
-void rxrpc_accept_incoming_calls(struct rxrpc_local *local)
+struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
+ struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
- struct rxrpc_skb_priv *sp;
- struct sockaddr_rxrpc srx;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_sock *rx;
- struct rxrpc_wire_header whdr;
- struct sk_buff *skb;
- int ret;
+ struct rxrpc_call *call;
+ u16 service_id = sp->hdr.serviceId;
- _enter("%d", local->debug_id);
+ _enter("");
- skb = skb_dequeue(&local->accept_queue);
- if (!skb) {
- _leave("\n");
- return;
+ /* Get the socket providing the service */
+ rx = rcu_dereference(local->service);
+ if (rx && service_id == rx->srx.srx_service)
+ goto found_service;
+
+ trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_INVALID_OPERATION, EOPNOTSUPP);
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ _leave(" = NULL [service]");
+ return NULL;
+
+found_service:
+ spin_lock(&rx->incoming_lock);
+ if (rx->sk.sk_state == RXRPC_CLOSE) {
+ trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
+ sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ _leave(" = NULL [close]");
+ call = NULL;
+ goto out;
}
- _net("incoming call skb %p", skb);
-
- sp = rxrpc_skb(skb);
-
- /* Set up a response packet header in case we need it */
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = 0;
- whdr.flags = 0;
- whdr.type = 0;
- whdr.userStatus = 0;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = 0;
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
- goto drop;
-
- /* get the socket providing the service */
- read_lock_bh(&local->services_lock);
- list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == sp->hdr.serviceId &&
- rx->sk.sk_state != RXRPC_CLOSE)
- goto found_service;
+ call = rxrpc_alloc_incoming_call(rx, local, conn, skb);
+ if (!call) {
+ skb->mark = RXRPC_SKB_MARK_BUSY;
+ _leave(" = NULL [busy]");
+ call = NULL;
+ goto out;
}
- read_unlock_bh(&local->services_lock);
- goto invalid_service;
-found_service:
- _debug("found service %hd", rx->srx.srx_service);
- if (sk_acceptq_is_full(&rx->sk))
- goto backlog_full;
- sk_acceptq_added(&rx->sk);
- sock_hold(&rx->sk);
- read_unlock_bh(&local->services_lock);
-
- ret = rxrpc_accept_incoming_call(local, rx, skb, &srx);
- if (ret < 0)
- sk_acceptq_removed(&rx->sk);
- sock_put(&rx->sk);
- switch (ret) {
- case -ECONNRESET: /* old calls are ignored */
- case -ECONNABORTED: /* aborted calls are reaborted or ignored */
- case 0:
- return;
- case -ECONNREFUSED:
- goto invalid_service;
- case -EBUSY:
- goto busy;
- case -EKEYREJECTED:
- goto security_mismatch;
+ trace_rxrpc_receive(call, rxrpc_receive_incoming,
+ sp->hdr.serial, sp->hdr.seq);
+
+ /* Make the call live. */
+ rxrpc_incoming_call(rx, call, skb);
+ conn = call->conn;
+
+ if (rx->notify_new_call)
+ rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ else
+ sk_acceptq_added(&rx->sk);
+
+ spin_lock(&conn->state_lock);
+ switch (conn->state) {
+ case RXRPC_CONN_SERVICE_UNSECURED:
+ conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
+ set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
+ rxrpc_queue_conn(call->conn);
+ break;
+
+ case RXRPC_CONN_SERVICE:
+ write_lock(&call->state_lock);
+ if (rx->discard_new_call)
+ call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+ else
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ write_unlock(&call->state_lock);
+ break;
+
+ case RXRPC_CONN_REMOTELY_ABORTED:
+ rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
+ conn->remote_abort, ECONNABORTED);
+ break;
+ case RXRPC_CONN_LOCALLY_ABORTED:
+ rxrpc_abort_call("CON", call, sp->hdr.seq,
+ conn->local_abort, ECONNABORTED);
+ break;
default:
BUG();
}
+ spin_unlock(&conn->state_lock);
-backlog_full:
- read_unlock_bh(&local->services_lock);
-busy:
- rxrpc_busy(local, &srx, &whdr);
- rxrpc_free_skb(skb);
- return;
+ if (call->state == RXRPC_CALL_SERVER_ACCEPTING)
+ rxrpc_notify_socket(call);
-drop:
- rxrpc_free_skb(skb);
- return;
+ /* We have to discard the prealloc queue's ref here and rely on a
+ * combination of the RCU read lock and refs held either by the socket
+ * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel
+ * service to prevent the call from being deallocated too early.
+ */
+ rxrpc_put_call(call, rxrpc_call_put);
-invalid_service:
- skb->priority = RX_INVALID_OPERATION;
- rxrpc_reject_packet(local, skb);
- return;
-
- /* can't change connection security type mid-flow */
-security_mismatch:
- skb->priority = RX_PROTOCOL_ERROR;
- rxrpc_reject_packet(local, skb);
- return;
+ _leave(" = %p{%d}", call, call->debug_id);
+out:
+ spin_unlock(&rx->incoming_lock);
+ return call;
}
/*
@@ -286,7 +430,8 @@ security_mismatch:
* - assign the user call ID to the call at the front of the queue
*/
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
- unsigned long user_call_ID)
+ unsigned long user_call_ID,
+ rxrpc_notify_rx_t notify_rx)
{
struct rxrpc_call *call;
struct rb_node *parent, **pp;
@@ -298,12 +443,13 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
write_lock(&rx->call_lock);
- ret = -ENODATA;
- if (list_empty(&rx->acceptq))
- goto out;
+ if (list_empty(&rx->to_be_accepted)) {
+ write_unlock(&rx->call_lock);
+ kleave(" = -ENODATA [empty]");
+ return ERR_PTR(-ENODATA);
+ }
/* check the user ID isn't already in use */
- ret = -EBADSLT;
pp = &rx->calls.rb_node;
parent = NULL;
while (*pp) {
@@ -315,62 +461,59 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
else if (user_call_ID > call->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto out;
+ goto id_in_use;
}
- /* dequeue the first call and check it's still valid */
- call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ /* Dequeue the first call and check it's still valid. We gain
+ * responsibility for the queue's reference.
+ */
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
list_del_init(&call->accept_link);
sk_acceptq_removed(&rx->sk);
+ rxrpc_see_call(call);
write_lock_bh(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_SERVER_ACCEPTING:
call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
break;
- case RXRPC_CALL_REMOTELY_ABORTED:
- case RXRPC_CALL_LOCALLY_ABORTED:
- ret = -ECONNABORTED;
- goto out_release;
- case RXRPC_CALL_NETWORK_ERROR:
- ret = call->conn->error;
+ case RXRPC_CALL_COMPLETE:
+ ret = call->error;
goto out_release;
- case RXRPC_CALL_DEAD:
- ret = -ETIME;
- goto out_discard;
default:
BUG();
}
/* formalise the acceptance */
+ call->notify_rx = notify_rx;
call->user_call_ID = user_call_ID;
+ rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
BUG();
- if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
- BUG();
- rxrpc_queue_call(call);
- rxrpc_get_call(call);
write_unlock_bh(&call->state_lock);
write_unlock(&rx->call_lock);
+ rxrpc_notify_socket(call);
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %p{%d}", call, call->debug_id);
return call;
- /* if the call is already dying or dead, then we leave the socket's ref
- * on it to be released by rxrpc_dead_call_expired() as induced by
- * rxrpc_release_call() */
out_release:
_debug("release %p", call);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
-out_discard:
write_unlock_bh(&call->state_lock);
- _debug("discard %p", call);
-out:
write_unlock(&rx->call_lock);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ goto out;
+
+id_in_use:
+ ret = -EBADSLT;
+ write_unlock(&rx->call_lock);
+out:
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %d", ret);
return ERR_PTR(ret);
}
@@ -382,6 +525,7 @@ out:
int rxrpc_reject_call(struct rxrpc_sock *rx)
{
struct rxrpc_call *call;
+ bool abort = false;
int ret;
_enter("");
@@ -390,88 +534,73 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
write_lock(&rx->call_lock);
- ret = -ENODATA;
- if (list_empty(&rx->acceptq))
- goto out;
+ if (list_empty(&rx->to_be_accepted)) {
+ write_unlock(&rx->call_lock);
+ return -ENODATA;
+ }
- /* dequeue the first call and check it's still valid */
- call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ /* Dequeue the first call and check it's still valid. We gain
+ * responsibility for the queue's reference.
+ */
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
list_del_init(&call->accept_link);
sk_acceptq_removed(&rx->sk);
+ rxrpc_see_call(call);
write_lock_bh(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_SERVER_ACCEPTING:
- call->state = RXRPC_CALL_SERVER_BUSY;
- if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events))
- rxrpc_queue_call(call);
- ret = 0;
- goto out_release;
- case RXRPC_CALL_REMOTELY_ABORTED:
- case RXRPC_CALL_LOCALLY_ABORTED:
- ret = -ECONNABORTED;
- goto out_release;
- case RXRPC_CALL_NETWORK_ERROR:
- ret = call->conn->error;
- goto out_release;
- case RXRPC_CALL_DEAD:
- ret = -ETIME;
+ __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED);
+ abort = true;
+ /* fall through */
+ case RXRPC_CALL_COMPLETE:
+ ret = call->error;
goto out_discard;
default:
BUG();
}
- /* if the call is already dying or dead, then we leave the socket's ref
- * on it to be released by rxrpc_dead_call_expired() as induced by
- * rxrpc_release_call() */
-out_release:
- _debug("release %p", call);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
out_discard:
write_unlock_bh(&call->state_lock);
- _debug("discard %p", call);
-out:
write_unlock(&rx->call_lock);
+ if (abort) {
+ rxrpc_send_abort_packet(call);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ }
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %d", ret);
return ret;
}
-/**
- * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call
- * @sock: The socket on which the impending call is waiting
- * @user_call_ID: The tag to attach to the call
+/*
+ * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls
+ * @sock: The socket on which to preallocate
+ * @notify_rx: Event notification function for the call
+ * @user_attach_call: Func to attach call to user_call_ID
+ * @user_call_ID: The tag to attach to the preallocated call
+ * @gfp: The allocation conditions.
*
- * Allow a kernel service to accept an incoming call, assuming the incoming
- * call is still valid.
- */
-struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
- unsigned long user_call_ID)
-{
- struct rxrpc_call *call;
-
- _enter(",%lx", user_call_ID);
- call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID);
- _leave(" = %p", call);
- return call;
-}
-EXPORT_SYMBOL(rxrpc_kernel_accept_call);
-
-/**
- * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
- * @sock: The socket on which the impending call is waiting
+ * Charge up the socket with preallocated calls, each with a user ID. A
+ * function should be provided to effect the attachment from the user's side.
+ * The user is given a ref to hold on the call.
*
- * Allow a kernel service to reject an incoming call with a BUSY message,
- * assuming the incoming call is still valid.
+ * Note that the call may be come connected before this function returns.
*/
-int rxrpc_kernel_reject_call(struct socket *sock)
+int rxrpc_kernel_charge_accept(struct socket *sock,
+ rxrpc_notify_rx_t notify_rx,
+ rxrpc_user_attach_call_t user_attach_call,
+ unsigned long user_call_ID, gfp_t gfp)
{
- int ret;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct rxrpc_backlog *b = rx->backlog;
- _enter("");
- ret = rxrpc_reject_call(rxrpc_sk(sock->sk));
- _leave(" = %d", ret);
- return ret;
+ if (sock->sk->sk_state == RXRPC_CLOSE)
+ return -ESHUTDOWN;
+
+ return rxrpc_service_prealloc_one(rx, b, notify_rx,
+ user_attach_call, user_call_ID,
+ gfp);
}
-EXPORT_SYMBOL(rxrpc_kernel_reject_call);
+EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index e60cf65c2232..97a17ada4431 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -22,1281 +22,402 @@
#include "ar-internal.h"
/*
- * propose an ACK be sent
+ * Set the timer
*/
-void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u32 serial, bool immediate)
+void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+ ktime_t now)
{
- unsigned long expiry;
- s8 prior = rxrpc_ack_priority[ack_reason];
-
- ASSERTCMP(prior, >, 0);
-
- _enter("{%d},%s,%%%x,%u",
- call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
+ unsigned long t_j, now_j = jiffies;
+ ktime_t t;
+ bool queue = false;
+
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ t = call->expire_at;
+ if (!ktime_after(t, now)) {
+ trace_rxrpc_timer(call, why, now, now_j);
+ queue = true;
+ goto out;
+ }
- if (prior < rxrpc_ack_priority[call->ackr_reason]) {
- if (immediate)
- goto cancel_timer;
- return;
- }
+ if (!ktime_after(call->resend_at, now)) {
+ call->resend_at = call->expire_at;
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ queue = true;
+ } else if (ktime_before(call->resend_at, t)) {
+ t = call->resend_at;
+ }
- /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
- * numbers */
- if (prior == rxrpc_ack_priority[call->ackr_reason]) {
- if (prior <= 4)
- call->ackr_serial = serial;
- if (immediate)
- goto cancel_timer;
- return;
- }
+ if (!ktime_after(call->ack_at, now)) {
+ call->ack_at = call->expire_at;
+ if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
+ queue = true;
+ } else if (ktime_before(call->ack_at, t)) {
+ t = call->ack_at;
+ }
- call->ackr_reason = ack_reason;
- call->ackr_serial = serial;
+ if (!ktime_after(call->ping_at, now)) {
+ call->ping_at = call->expire_at;
+ if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
+ queue = true;
+ } else if (ktime_before(call->ping_at, t)) {
+ t = call->ping_at;
+ }
- switch (ack_reason) {
- case RXRPC_ACK_DELAY:
- _debug("run delay timer");
- expiry = rxrpc_soft_ack_delay;
- goto run_timer;
+ t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
+ t_j += jiffies;
- case RXRPC_ACK_IDLE:
- if (!immediate) {
- _debug("run defer timer");
- expiry = rxrpc_idle_ack_delay;
- goto run_timer;
- }
- goto cancel_timer;
+ /* We have to make sure that the calculated jiffies value falls
+ * at or after the nsec value, or we may loop ceaselessly
+ * because the timer times out, but we haven't reached the nsec
+ * timeout yet.
+ */
+ t_j++;
- case RXRPC_ACK_REQUESTED:
- expiry = rxrpc_requested_ack_delay;
- if (!expiry)
- goto cancel_timer;
- if (!immediate || serial == 1) {
- _debug("run defer timer");
- goto run_timer;
+ if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
+ mod_timer(&call->timer, t_j);
+ trace_rxrpc_timer(call, why, now, now_j);
}
-
- default:
- _debug("immediate ACK");
- goto cancel_timer;
}
-run_timer:
- expiry += jiffies;
- if (!timer_pending(&call->ack_timer) ||
- time_after(call->ack_timer.expires, expiry))
- mod_timer(&call->ack_timer, expiry);
- return;
-
-cancel_timer:
- _debug("cancel timer %%%u", serial);
- try_to_del_timer_sync(&call->ack_timer);
- read_lock_bh(&call->state_lock);
- if (call->state <= RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
+out:
+ if (queue)
rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
-}
-
-/*
- * propose an ACK be sent, locking the call structure
- */
-void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u32 serial, bool immediate)
-{
- s8 prior = rxrpc_ack_priority[ack_reason];
-
- if (prior > rxrpc_ack_priority[call->ackr_reason]) {
- spin_lock_bh(&call->lock);
- __rxrpc_propose_ACK(call, ack_reason, serial, immediate);
- spin_unlock_bh(&call->lock);
- }
}
/*
- * set the resend timer
+ * Set the timer
*/
-static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
- unsigned long resend_at)
+void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+ ktime_t now)
{
read_lock_bh(&call->state_lock);
- if (call->state >= RXRPC_CALL_COMPLETE)
- resend = 0;
-
- if (resend & 1) {
- _debug("SET RESEND");
- set_bit(RXRPC_CALL_EV_RESEND, &call->events);
- }
-
- if (resend & 2) {
- _debug("MODIFY RESEND TIMER");
- set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- mod_timer(&call->resend_timer, resend_at);
- } else {
- _debug("KILL RESEND TIMER");
- del_timer_sync(&call->resend_timer);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
+ __rxrpc_set_timer(call, why, now);
read_unlock_bh(&call->state_lock);
}
/*
- * resend packets
+ * Propose a PING ACK be sent.
*/
-static void rxrpc_resend(struct rxrpc_call *call)
+static void rxrpc_propose_ping(struct rxrpc_call *call,
+ bool immediate, bool background)
{
- struct rxrpc_wire_header *whdr;
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- bool stop;
- int loop;
- u8 resend;
-
- _enter("{%d,%d,%d,%d},",
- call->acks_hard, call->acks_unacked,
- atomic_read(&call->sequence),
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
-
- stop = false;
- resend = 0;
- resend_at = 0;
-
- for (loop = call->acks_tail;
- loop != call->acks_head || stop;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- if (*p_txb & 1)
- continue;
-
- txb = (struct sk_buff *) *p_txb;
- sp = rxrpc_skb(txb);
-
- if (sp->need_resend) {
- sp->need_resend = false;
-
- /* each Tx packet has a new serial number */
- sp->hdr.serial = atomic_inc_return(&call->conn->serial);
-
- whdr = (struct rxrpc_wire_header *)txb->head;
- whdr->serial = htonl(sp->hdr.serial);
-
- _proto("Tx DATA %%%u { #%d }",
- sp->hdr.serial, sp->hdr.seq);
- if (rxrpc_send_data_packet(call->conn, txb) < 0) {
- stop = true;
- sp->resend_at = jiffies + 3;
- } else {
- sp->resend_at =
- jiffies + rxrpc_resend_timeout;
- }
- }
-
- if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
- }
- }
-
- rxrpc_set_resend(call, resend, resend_at);
- _leave("");
-}
-
-/*
- * handle resend timer expiry
- */
-static void rxrpc_resend_timer(struct rxrpc_call *call)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- int loop;
- u8 resend;
-
- _enter("%d,%d,%d",
- call->acks_tail, call->acks_unacked, call->acks_head);
-
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- resend = 0;
- resend_at = 0;
-
- for (loop = call->acks_unacked;
- loop != call->acks_head;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
-
- ASSERT(!(*p_txb & 1));
+ if (immediate) {
+ if (background &&
+ !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
+ rxrpc_queue_call(call);
+ } else {
+ ktime_t now = ktime_get_real();
+ ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
- if (sp->need_resend) {
- ;
- } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
+ if (ktime_before(ping_at, call->ping_at)) {
+ call->ping_at = ping_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
}
}
-
- rxrpc_set_resend(call, resend, resend_at);
- _leave("");
}
/*
- * process soft ACKs of our transmitted packets
- * - these indicate packets the peer has or has not received, but hasn't yet
- * given to the consumer, and so can still be discarded and re-requested
+ * propose an ACK be sent
*/
-static int rxrpc_process_soft_ACKs(struct rxrpc_call *call,
- struct rxrpc_ackpacket *ack,
- struct sk_buff *skb)
+static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u16 skew, u32 serial, bool immediate,
+ bool background,
+ enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- int loop;
- u8 sacks[RXRPC_MAXACKS], resend;
-
- _enter("{%d,%d},{%d},",
- call->acks_hard,
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
- ack->nAcks);
-
- if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
- goto protocol_error;
-
- resend = 0;
- resend_at = 0;
- for (loop = 0; loop < ack->nAcks; loop++) {
- p_txb = call->acks_window;
- p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
-
- switch (sacks[loop]) {
- case RXRPC_ACK_TYPE_ACK:
- sp->need_resend = false;
- *p_txb |= 1;
- break;
- case RXRPC_ACK_TYPE_NACK:
- sp->need_resend = true;
- *p_txb &= ~1;
- resend = 1;
- break;
- default:
- _debug("Unsupported ACK type %d", sacks[loop]);
- goto protocol_error;
- }
- }
-
- smp_mb();
- call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
-
- /* anything not explicitly ACK'd is implicitly NACK'd, but may just not
- * have been received or processed yet by the far end */
- for (loop = call->acks_unacked;
- loop != call->acks_head;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
+ enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
+ unsigned int expiry = rxrpc_soft_ack_delay;
+ ktime_t now, ack_at;
+ s8 prior = rxrpc_ack_priority[ack_reason];
- if (*p_txb & 1) {
- /* packet must have been discarded */
- sp->need_resend = true;
- *p_txb &= ~1;
- resend |= 1;
- } else if (sp->need_resend) {
- ;
- } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
+ /* Pings are handled specially because we don't want to accidentally
+ * lose a ping response by subsuming it into a ping.
+ */
+ if (ack_reason == RXRPC_ACK_PING) {
+ rxrpc_propose_ping(call, immediate, background);
+ goto trace;
+ }
+
+ /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
+ * numbers, but we don't alter the timeout.
+ */
+ _debug("prior %u %u vs %u %u",
+ ack_reason, prior,
+ call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]);
+ if (ack_reason == call->ackr_reason) {
+ if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) {
+ outcome = rxrpc_propose_ack_update;
+ call->ackr_serial = serial;
+ call->ackr_skew = skew;
}
+ if (!immediate)
+ goto trace;
+ } else if (prior > rxrpc_ack_priority[call->ackr_reason]) {
+ call->ackr_reason = ack_reason;
+ call->ackr_serial = serial;
+ call->ackr_skew = skew;
+ } else {
+ outcome = rxrpc_propose_ack_subsume;
}
- rxrpc_set_resend(call, resend, resend_at);
- _leave(" = 0");
- return 0;
-
-protocol_error:
- _leave(" = -EPROTO");
- return -EPROTO;
-}
-
-/*
- * discard hard-ACK'd packets from the Tx window
- */
-static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
-{
- unsigned long _skb;
- int tail = call->acks_tail, old_tail;
- int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
+ switch (ack_reason) {
+ case RXRPC_ACK_REQUESTED:
+ if (rxrpc_requested_ack_delay < expiry)
+ expiry = rxrpc_requested_ack_delay;
+ if (serial == 1)
+ immediate = false;
+ break;
- _enter("{%u,%u},%u", call->acks_hard, win, hard);
+ case RXRPC_ACK_DELAY:
+ if (rxrpc_soft_ack_delay < expiry)
+ expiry = rxrpc_soft_ack_delay;
+ break;
- ASSERTCMP(hard - call->acks_hard, <=, win);
+ case RXRPC_ACK_IDLE:
+ if (rxrpc_idle_ack_delay < expiry)
+ expiry = rxrpc_idle_ack_delay;
+ break;
- while (call->acks_hard < hard) {
- smp_read_barrier_depends();
- _skb = call->acks_window[tail] & ~1;
- rxrpc_free_skb((struct sk_buff *) _skb);
- old_tail = tail;
- tail = (tail + 1) & (call->acks_winsz - 1);
- call->acks_tail = tail;
- if (call->acks_unacked == old_tail)
- call->acks_unacked = tail;
- call->acks_hard++;
+ default:
+ immediate = true;
+ break;
}
- wake_up(&call->tx_waitq);
-}
-
-/*
- * clear the Tx window in the event of a failure
- */
-static void rxrpc_clear_tx_window(struct rxrpc_call *call)
-{
- rxrpc_rotate_tx_window(call, atomic_read(&call->sequence));
-}
-
-/*
- * drain the out of sequence received packet queue into the packet Rx queue
- */
-static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- bool terminal;
- int ret;
-
- _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
-
- spin_lock_bh(&call->lock);
-
- ret = -ECONNRESET;
- if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
- goto socket_unavailable;
-
- skb = skb_dequeue(&call->rx_oos_queue);
- if (skb) {
- sp = rxrpc_skb(skb);
-
- _debug("drain OOS packet %d [%d]",
- sp->hdr.seq, call->rx_first_oos);
-
- if (sp->hdr.seq != call->rx_first_oos) {
- skb_queue_head(&call->rx_oos_queue, skb);
- call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
- _debug("requeue %p {%u}", skb, call->rx_first_oos);
- } else {
- skb->mark = RXRPC_SKB_MARK_DATA;
- terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
- !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
- ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
- BUG_ON(ret < 0);
- _debug("drain #%u", call->rx_data_post);
- call->rx_data_post++;
-
- /* find out what the next packet is */
- skb = skb_peek(&call->rx_oos_queue);
- if (skb)
- call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
- else
- call->rx_first_oos = 0;
- _debug("peek %p {%u}", skb, call->rx_first_oos);
+ if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+ _debug("already scheduled");
+ } else if (immediate || expiry == 0) {
+ _debug("immediate ACK %lx", call->events);
+ if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) &&
+ background)
+ rxrpc_queue_call(call);
+ } else {
+ now = ktime_get_real();
+ ack_at = ktime_add_ms(now, expiry);
+ if (ktime_before(ack_at, call->ack_at)) {
+ call->ack_at = ack_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
}
}
- ret = 0;
-socket_unavailable:
- spin_unlock_bh(&call->lock);
- _leave(" = %d", ret);
- return ret;
+trace:
+ trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate,
+ background, outcome);
}
/*
- * insert an out of sequence packet into the buffer
+ * propose an ACK be sent, locking the call structure
*/
-static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
- struct sk_buff *skb)
+void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u16 skew, u32 serial, bool immediate, bool background,
+ enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_skb_priv *sp, *psp;
- struct sk_buff *p;
- u32 seq;
-
- sp = rxrpc_skb(skb);
- seq = sp->hdr.seq;
- _enter(",,{%u}", seq);
-
- skb->destructor = rxrpc_packet_destructor;
- ASSERTCMP(sp->call, ==, NULL);
- sp->call = call;
- rxrpc_get_call(call);
- atomic_inc(&call->skb_count);
-
- /* insert into the buffer in sequence order */
spin_lock_bh(&call->lock);
-
- skb_queue_walk(&call->rx_oos_queue, p) {
- psp = rxrpc_skb(p);
- if (psp->hdr.seq > seq) {
- _debug("insert oos #%u before #%u", seq, psp->hdr.seq);
- skb_insert(p, skb, &call->rx_oos_queue);
- goto inserted;
- }
- }
-
- _debug("append oos #%u", seq);
- skb_queue_tail(&call->rx_oos_queue, skb);
-inserted:
-
- /* we might now have a new front to the queue */
- if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
- call->rx_first_oos = seq;
-
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- call->rx_data_post == call->rx_first_oos) {
- _debug("drain rx oos now");
- set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
- }
- read_unlock(&call->state_lock);
-
+ __rxrpc_propose_ACK(call, ack_reason, skew, serial,
+ immediate, background, why);
spin_unlock_bh(&call->lock);
- _leave(" [stored #%u]", call->rx_first_oos);
}
/*
- * clear the Tx window on final ACK reception
+ * Handle congestion being detected by the retransmit timeout.
*/
-static void rxrpc_zap_tx_window(struct rxrpc_call *call)
+static void rxrpc_congestion_timeout(struct rxrpc_call *call)
{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- unsigned long _skb, *acks_window;
- u8 winsz = call->acks_winsz;
- int tail;
-
- acks_window = call->acks_window;
- call->acks_window = NULL;
-
- while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
- tail = call->acks_tail;
- smp_read_barrier_depends();
- _skb = acks_window[tail] & ~1;
- smp_mb();
- call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
-
- skb = (struct sk_buff *) _skb;
- sp = rxrpc_skb(skb);
- _debug("+++ clear Tx %u", sp->hdr.seq);
- rxrpc_free_skb(skb);
- }
-
- kfree(acks_window);
-}
-
-/*
- * process the extra information that may be appended to an ACK packet
- */
-static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int latest, int nAcks)
-{
- struct rxrpc_ackinfo ackinfo;
- struct rxrpc_peer *peer;
- unsigned int mtu;
-
- if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) {
- _leave(" [no ackinfo]");
- return;
- }
-
- _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
- latest,
- ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU),
- ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max));
-
- mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
-
- peer = call->conn->params.peer;
- if (mtu < peer->maxdata) {
- spin_lock_bh(&peer->lock);
- peer->maxdata = mtu;
- peer->mtu = mtu + peer->hdrsize;
- spin_unlock_bh(&peer->lock);
- _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
- }
+ set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
}
/*
- * process packets in the reception queue
+ * Perform retransmission of NAK'd and unack'd packets.
*/
-static int rxrpc_process_rx_queue(struct rxrpc_call *call,
- u32 *_abort_code)
+static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
{
- struct rxrpc_ackpacket ack;
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
- bool post_ACK;
- int latest;
- u32 hard, tx;
+ rxrpc_seq_t cursor, seq, top;
+ ktime_t max_age, oldest, ack_ts;
+ int ix;
+ u8 annotation, anno_type, retrans = 0, unacked = 0;
- _enter("");
+ _enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
-process_further:
- skb = skb_dequeue(&call->rx_queue);
- if (!skb)
- return -EAGAIN;
+ max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
- _net("deferred skb %p", skb);
-
- sp = rxrpc_skb(skb);
-
- _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
-
- post_ACK = false;
-
- switch (sp->hdr.type) {
- /* data packets that wind up here have been received out of
- * order, need security processing or are jumbo packets */
- case RXRPC_PACKET_TYPE_DATA:
- _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
-
- /* secured packets must be verified and possibly decrypted */
- if (call->conn->security->verify_packet(call, skb,
- _abort_code) < 0)
- goto protocol_error;
-
- rxrpc_insert_oos_packet(call, skb);
- goto process_further;
-
- /* partial ACK to process */
- case RXRPC_PACKET_TYPE_ACK:
- if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
- _debug("extraction failure");
- goto protocol_error;
- }
- if (!skb_pull(skb, sizeof(ack)))
- BUG();
-
- latest = sp->hdr.serial;
- hard = ntohl(ack.firstPacket);
- tx = atomic_read(&call->sequence);
-
- _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- latest,
- ntohs(ack.maxSkew),
- hard,
- ntohl(ack.previousPacket),
- ntohl(ack.serial),
- rxrpc_acks(ack.reason),
- ack.nAcks);
-
- rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
-
- if (ack.reason == RXRPC_ACK_PING) {
- _proto("Rx ACK %%%u PING Request", latest);
- rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
- sp->hdr.serial, true);
- }
-
- /* discard any out-of-order or duplicate ACKs */
- if (latest - call->acks_latest <= 0) {
- _debug("discard ACK %d <= %d",
- latest, call->acks_latest);
- goto discard;
- }
- call->acks_latest = latest;
-
- if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
- call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
- call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
- goto discard;
-
- _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
-
- if (hard > 0) {
- if (hard - 1 > tx) {
- _debug("hard-ACK'd packet %d not transmitted"
- " (%d top)",
- hard - 1, tx);
- goto protocol_error;
- }
+ spin_lock_bh(&call->lock);
- if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
- call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
- hard > tx) {
- call->acks_hard = tx;
- goto all_acked;
- }
+ cursor = call->tx_hard_ack;
+ top = call->tx_top;
+ ASSERT(before_eq(cursor, top));
+ if (cursor == top)
+ goto out_unlock;
+
+ /* Scan the packet list without dropping the lock and decide which of
+ * the packets in the Tx buffer we're going to resend and what the new
+ * resend timeout will be.
+ */
+ oldest = now;
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
+ continue;
- smp_rmb();
- rxrpc_rotate_tx_window(call, hard - 1);
- }
+ skb = call->rxtx_buffer[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
+ sp = rxrpc_skb(skb);
- if (ack.nAcks > 0) {
- if (hard - 1 + ack.nAcks > tx) {
- _debug("soft-ACK'd packet %d+%d not"
- " transmitted (%d top)",
- hard - 1, ack.nAcks, tx);
- goto protocol_error;
+ if (anno_type == RXRPC_TX_ANNO_UNACK) {
+ if (ktime_after(skb->tstamp, max_age)) {
+ if (ktime_before(skb->tstamp, oldest))
+ oldest = skb->tstamp;
+ continue;
}
-
- if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0)
- goto protocol_error;
+ if (!(annotation & RXRPC_TX_ANNO_RESENT))
+ unacked++;
}
- goto discard;
-
- /* complete ACK to process */
- case RXRPC_PACKET_TYPE_ACKALL:
- goto all_acked;
-
- /* abort and busy are handled elsewhere */
- case RXRPC_PACKET_TYPE_BUSY:
- case RXRPC_PACKET_TYPE_ABORT:
- BUG();
-
- /* connection level events - also handled elsewhere */
- case RXRPC_PACKET_TYPE_CHALLENGE:
- case RXRPC_PACKET_TYPE_RESPONSE:
- case RXRPC_PACKET_TYPE_DEBUG:
- BUG();
- }
-
- /* if we've had a hard ACK that covers all the packets we've sent, then
- * that ends that phase of the operation */
-all_acked:
- write_lock_bh(&call->state_lock);
- _debug("ack all %d", call->state);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
- break;
- case RXRPC_CALL_SERVER_AWAIT_ACK:
- _debug("srv complete");
- call->state = RXRPC_CALL_COMPLETE;
- post_ACK = true;
- break;
- case RXRPC_CALL_CLIENT_SEND_REQUEST:
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- goto protocol_error_unlock; /* can't occur yet */
- default:
- write_unlock_bh(&call->state_lock);
- goto discard; /* assume packet left over from earlier phase */
+ /* Okay, we need to retransmit a packet. */
+ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation;
+ retrans++;
+ trace_rxrpc_retransmit(call, seq, annotation | anno_type,
+ ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
}
- write_unlock_bh(&call->state_lock);
-
- /* if all the packets we sent are hard-ACK'd, then we can discard
- * whatever we've got left */
- _debug("clear Tx %d",
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
-
- del_timer_sync(&call->resend_timer);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
- if (call->acks_window)
- rxrpc_zap_tx_window(call);
+ if (unacked)
+ rxrpc_congestion_timeout(call);
- if (post_ACK) {
- /* post the final ACK message for userspace to pick up */
- _debug("post ACK");
- skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
- sp->call = call;
- rxrpc_get_call(call);
- atomic_inc(&call->skb_count);
- spin_lock_bh(&call->lock);
- if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
- BUG();
+ /* If there was nothing that needed retransmission then it's likely
+ * that an ACK got lost somewhere. Send a ping to find out instead of
+ * retransmitting data.
+ */
+ if (!retrans) {
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
spin_unlock_bh(&call->lock);
- goto process_further;
- }
-
-discard:
- rxrpc_free_skb(skb);
- goto process_further;
-
-protocol_error_unlock:
- write_unlock_bh(&call->state_lock);
-protocol_error:
- rxrpc_free_skb(skb);
- _leave(" = -EPROTO");
- return -EPROTO;
-}
-
-/*
- * post a message to the socket Rx queue for recvmsg() to pick up
- */
-static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
- bool fatal)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- int ret;
-
- _enter("{%d,%lx},%u,%u,%d",
- call->debug_id, call->flags, mark, error, fatal);
-
- /* remove timers and things for fatal messages */
- if (fatal) {
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
+ ack_ts = ktime_sub(now, call->acks_latest_ts);
+ if (ktime_to_ns(ack_ts) < call->peer->rtt)
+ goto out;
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
+ rxrpc_propose_ack_ping_for_lost_ack);
+ rxrpc_send_ack_packet(call, true);
+ goto out;
+ }
+
+ /* Now go through the Tx window and perform the retransmissions. We
+ * have to drop the lock for each send. If an ACK comes in whilst the
+ * lock is dropped, it may clear some of the retransmission markers for
+ * packets that it soft-ACKs.
+ */
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type != RXRPC_TX_ANNO_RETRANS)
+ continue;
- if (mark != RXRPC_SKB_MARK_NEW_CALL &&
- !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- _leave("[no userid]");
- return 0;
- }
+ skb = call->rxtx_buffer[ix];
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ spin_unlock_bh(&call->lock);
- if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
- skb = alloc_skb(0, GFP_NOFS);
- if (!skb)
- return -ENOMEM;
+ if (rxrpc_send_data_packet(call, skb, true) < 0) {
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ return;
+ }
- rxrpc_new_skb(skb);
+ if (rxrpc_is_client_call(call))
+ rxrpc_expose_client_call(call);
- skb->mark = mark;
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ spin_lock_bh(&call->lock);
- sp = rxrpc_skb(skb);
- memset(sp, 0, sizeof(*sp));
- sp->error = error;
- sp->call = call;
- rxrpc_get_call(call);
- atomic_inc(&call->skb_count);
+ /* We need to clear the retransmit state, but there are two
+ * things we need to be aware of: A new ACK/NAK might have been
+ * received and the packet might have been hard-ACK'd (in which
+ * case it will no longer be in the buffer).
+ */
+ if (after(seq, call->tx_hard_ack)) {
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS ||
+ anno_type == RXRPC_TX_ANNO_NAK) {
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ annotation |= RXRPC_TX_ANNO_UNACK;
+ }
+ annotation |= RXRPC_TX_ANNO_RESENT;
+ call->rxtx_annotations[ix] = annotation;
+ }
- spin_lock_bh(&call->lock);
- ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
- spin_unlock_bh(&call->lock);
- BUG_ON(ret < 0);
+ if (after(call->tx_hard_ack, seq))
+ seq = call->tx_hard_ack;
}
- return 0;
+out_unlock:
+ spin_unlock_bh(&call->lock);
+out:
+ _leave("");
}
/*
- * handle background processing of incoming call packets and ACK / abort
- * generation
+ * Handle retransmission and deferred ACK/abort generation.
*/
void rxrpc_process_call(struct work_struct *work)
{
struct rxrpc_call *call =
container_of(work, struct rxrpc_call, processor);
- struct rxrpc_wire_header whdr;
- struct rxrpc_ackpacket ack;
- struct rxrpc_ackinfo ackinfo;
- struct msghdr msg;
- struct kvec iov[5];
- enum rxrpc_call_event genbit;
- unsigned long bits;
- __be32 data, pad;
- size_t len;
- int loop, nbit, ioc, ret, mtu;
- u32 serial, abort_code = RX_PROTOCOL_ERROR;
- u8 *acks = NULL;
-
- //printk("\n--------------------\n");
- _enter("{%d,%s,%lx} [%lu]",
- call->debug_id, rxrpc_call_states[call->state], call->events,
- (jiffies - call->creation_jif) / (HZ / 10));
-
- if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) {
- _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX");
- return;
- }
-
- if (!call->conn)
- goto skip_msg_init;
-
- /* there's a good chance we're going to have to send a message, so set
- * one up in advance */
- msg.msg_name = &call->conn->params.peer->srx.transport;
- msg.msg_namelen = call->conn->params.peer->srx.transport_len;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- whdr.epoch = htonl(call->conn->proto.epoch);
- whdr.cid = htonl(call->cid);
- whdr.callNumber = htonl(call->call_id);
- whdr.seq = 0;
- whdr.type = RXRPC_PACKET_TYPE_ACK;
- whdr.flags = call->conn->out_clientflag;
- whdr.userStatus = 0;
- whdr.securityIndex = call->conn->security_ix;
- whdr._rsvd = 0;
- whdr.serviceId = htons(call->service_id);
-
- memset(iov, 0, sizeof(iov));
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
-skip_msg_init:
-
- /* deal with events of a final nature */
- if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
- enum rxrpc_skb_mark mark;
- int error;
+ ktime_t now;
- clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
-
- error = call->error_report;
- if (error < RXRPC_LOCAL_ERROR_OFFSET) {
- mark = RXRPC_SKB_MARK_NET_ERROR;
- _debug("post net error %d", error);
- } else {
- mark = RXRPC_SKB_MARK_LOCAL_ERROR;
- error -= RXRPC_LOCAL_ERROR_OFFSET;
- _debug("post net local error %d", error);
- }
-
- if (rxrpc_post_message(call, mark, error, true) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
- goto kill_ACKs;
- }
-
- if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
- ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
-
- clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
-
- _debug("post conn abort");
-
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- call->conn->error, true) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- goto kill_ACKs;
- }
+ rxrpc_see_call(call);
- if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
- whdr.type = RXRPC_PACKET_TYPE_BUSY;
- genbit = RXRPC_CALL_EV_REJECT_BUSY;
- goto send_message;
- }
-
- if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
- ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
-
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- ECONNABORTED, true) < 0)
- goto no_mem;
- whdr.type = RXRPC_PACKET_TYPE_ABORT;
- data = htonl(call->local_abort);
- iov[1].iov_base = &data;
- iov[1].iov_len = sizeof(data);
- genbit = RXRPC_CALL_EV_ABORT;
- goto send_message;
- }
-
- if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
- genbit = RXRPC_CALL_EV_ACK_FINAL;
-
- ack.bufferSpace = htons(8);
- ack.maxSkew = 0;
- ack.serial = 0;
- ack.reason = RXRPC_ACK_IDLE;
- ack.nAcks = 0;
- call->ackr_reason = 0;
-
- spin_lock_bh(&call->lock);
- ack.serial = htonl(call->ackr_serial);
- ack.previousPacket = htonl(call->ackr_prev_seq);
- ack.firstPacket = htonl(call->rx_data_eaten + 1);
- spin_unlock_bh(&call->lock);
-
- pad = 0;
-
- iov[1].iov_base = &ack;
- iov[1].iov_len = sizeof(ack);
- iov[2].iov_base = &pad;
- iov[2].iov_len = 3;
- iov[3].iov_base = &ackinfo;
- iov[3].iov_len = sizeof(ackinfo);
- goto send_ACK;
- }
-
- if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
- (1 << RXRPC_CALL_EV_RCVD_ABORT))
- ) {
- u32 mark;
-
- if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
- mark = RXRPC_SKB_MARK_REMOTE_ABORT;
- else
- mark = RXRPC_SKB_MARK_BUSY;
-
- _debug("post abort/busy");
- rxrpc_clear_tx_window(call);
- if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
- goto no_mem;
-
- clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- goto kill_ACKs;
- }
-
- if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
- _debug("do implicit ackall");
- rxrpc_clear_tx_window(call);
- }
-
- if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
- write_lock_bh(&call->state_lock);
- if (call->state <= RXRPC_CALL_COMPLETE) {
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = RX_CALL_TIMEOUT;
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- }
- write_unlock_bh(&call->state_lock);
-
- _debug("post timeout");
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- ETIME, true) < 0)
- goto no_mem;
+ //printk("\n--------------------\n");
+ _enter("{%d,%s,%lx}",
+ call->debug_id, rxrpc_call_states[call->state], call->events);
- clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
- goto kill_ACKs;
+recheck_state:
+ if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
+ rxrpc_send_abort_packet(call);
+ goto recheck_state;
}
- /* deal with assorted inbound messages */
- if (!skb_queue_empty(&call->rx_queue)) {
- switch (rxrpc_process_rx_queue(call, &abort_code)) {
- case 0:
- case -EAGAIN:
- break;
- case -ENOMEM:
- goto no_mem;
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EPROTO:
- rxrpc_abort_call(call, abort_code);
- goto kill_ACKs;
- }
+ if (call->state == RXRPC_CALL_COMPLETE) {
+ del_timer_sync(&call->timer);
+ rxrpc_notify_socket(call);
+ goto out_put;
}
- /* handle resending */
- if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
- rxrpc_resend_timer(call);
- if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events))
- rxrpc_resend(call);
-
- /* consider sending an ordinary ACK */
- if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
- _debug("send ACK: window: %d - %d { %lx }",
- call->rx_data_eaten, call->ackr_win_top,
- call->ackr_window[0]);
-
- if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
- /* ACK by sending reply DATA packet in this state */
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
- goto maybe_reschedule;
- }
-
- genbit = RXRPC_CALL_EV_ACK;
-
- acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
- GFP_NOFS);
- if (!acks)
- goto no_mem;
-
- //hdr.flags = RXRPC_SLOW_START_OK;
- ack.bufferSpace = htons(8);
- ack.maxSkew = 0;
-
- spin_lock_bh(&call->lock);
- ack.reason = call->ackr_reason;
- ack.serial = htonl(call->ackr_serial);
- ack.previousPacket = htonl(call->ackr_prev_seq);
- ack.firstPacket = htonl(call->rx_data_eaten + 1);
-
- ack.nAcks = 0;
- for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
- nbit = loop * BITS_PER_LONG;
- for (bits = call->ackr_window[loop]; bits; bits >>= 1
- ) {
- _debug("- l=%d n=%d b=%lx", loop, nbit, bits);
- if (bits & 1) {
- acks[nbit] = RXRPC_ACK_TYPE_ACK;
- ack.nAcks = nbit + 1;
- }
- nbit++;
- }
- }
- call->ackr_reason = 0;
- spin_unlock_bh(&call->lock);
-
- pad = 0;
-
- iov[1].iov_base = &ack;
- iov[1].iov_len = sizeof(ack);
- iov[2].iov_base = acks;
- iov[2].iov_len = ack.nAcks;
- iov[3].iov_base = &pad;
- iov[3].iov_len = 3;
- iov[4].iov_base = &ackinfo;
- iov[4].iov_len = sizeof(ackinfo);
-
- switch (ack.reason) {
- case RXRPC_ACK_REQUESTED:
- case RXRPC_ACK_DUPLICATE:
- case RXRPC_ACK_OUT_OF_SEQUENCE:
- case RXRPC_ACK_EXCEEDS_WINDOW:
- case RXRPC_ACK_NOSPACE:
- case RXRPC_ACK_PING:
- case RXRPC_ACK_PING_RESPONSE:
- goto send_ACK_with_skew;
- case RXRPC_ACK_DELAY:
- case RXRPC_ACK_IDLE:
- goto send_ACK;
- }
+ now = ktime_get_real();
+ if (ktime_before(call->expire_at, now)) {
+ rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ goto recheck_state;
}
- /* handle completion of security negotiations on an incoming
- * connection */
- if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) {
- _debug("secured");
- spin_lock_bh(&call->lock);
-
- if (call->state == RXRPC_CALL_SERVER_SECURING) {
- _debug("securing");
- write_lock(&call->socket->call_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- _debug("not released");
- call->state = RXRPC_CALL_SERVER_ACCEPTING;
- list_move_tail(&call->accept_link,
- &call->socket->acceptq);
- }
- write_unlock(&call->socket->call_lock);
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
- read_unlock(&call->state_lock);
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+ if (call->ackr_reason) {
+ rxrpc_send_ack_packet(call, false);
+ goto recheck_state;
}
-
- spin_unlock_bh(&call->lock);
- if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
- goto maybe_reschedule;
}
- /* post a notification of an acceptable connection to the app */
- if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) {
- _debug("post accept");
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
- 0, false) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
- goto maybe_reschedule;
+ if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
+ rxrpc_send_ack_packet(call, true);
+ goto recheck_state;
}
- /* handle incoming call acceptance */
- if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) {
- _debug("accepted");
- ASSERTCMP(call->rx_data_post, ==, 0);
- call->rx_data_post = 1;
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
- read_unlock_bh(&call->state_lock);
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) {
+ rxrpc_resend(call, now);
+ goto recheck_state;
}
- /* drain the out of sequence received packet queue into the packet Rx
- * queue */
- if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
- while (call->rx_data_post == call->rx_first_oos)
- if (rxrpc_drain_rx_oos_queue(call) < 0)
- break;
- goto maybe_reschedule;
- }
-
- if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- rxrpc_release_call(call);
- clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
- }
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
/* other events may have been raised since we started checking */
- goto maybe_reschedule;
-
-send_ACK_with_skew:
- ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
- ntohl(ack.serial));
-send_ACK:
- mtu = call->conn->params.peer->if_mtu;
- mtu -= call->conn->params.peer->hdrsize;
- ackinfo.maxMTU = htonl(mtu);
- ackinfo.rwind = htonl(rxrpc_rx_window_size);
-
- /* permit the peer to send us jumbo packets if it wants to */
- ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
- ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
-
- serial = atomic_inc_return(&call->conn->serial);
- whdr.serial = htonl(serial);
- _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- serial,
- ntohs(ack.maxSkew),
- ntohl(ack.firstPacket),
- ntohl(ack.previousPacket),
- ntohl(ack.serial),
- rxrpc_acks(ack.reason),
- ack.nAcks);
-
- del_timer_sync(&call->ack_timer);
- if (ack.nAcks > 0)
- set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
- goto send_message_2;
-
-send_message:
- _debug("send message");
-
- serial = atomic_inc_return(&call->conn->serial);
- whdr.serial = htonl(serial);
- _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
-send_message_2:
-
- len = iov[0].iov_len;
- ioc = 1;
- if (iov[4].iov_len) {
- ioc = 5;
- len += iov[4].iov_len;
- len += iov[3].iov_len;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[3].iov_len) {
- ioc = 4;
- len += iov[3].iov_len;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[2].iov_len) {
- ioc = 3;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[1].iov_len) {
- ioc = 2;
- len += iov[1].iov_len;
- }
-
- ret = kernel_sendmsg(call->conn->params.local->socket,
- &msg, iov, ioc, len);
- if (ret < 0) {
- _debug("sendmsg failed: %d", ret);
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- goto error;
- }
-
- switch (genbit) {
- case RXRPC_CALL_EV_ABORT:
- clear_bit(genbit, &call->events);
- clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- goto kill_ACKs;
-
- case RXRPC_CALL_EV_ACK_FINAL:
- write_lock_bh(&call->state_lock);
- if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
- call->state = RXRPC_CALL_COMPLETE;
- write_unlock_bh(&call->state_lock);
- goto kill_ACKs;
-
- default:
- clear_bit(genbit, &call->events);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- case RXRPC_CALL_CLIENT_RECV_REPLY:
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- case RXRPC_CALL_SERVER_ACK_REQUEST:
- _debug("start ACK timer");
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
- call->ackr_serial, false);
- default:
- break;
- }
- goto maybe_reschedule;
- }
-
-kill_ACKs:
- del_timer_sync(&call->ack_timer);
- if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
- rxrpc_put_call(call);
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
-
-maybe_reschedule:
- if (call->events || !skb_queue_empty(&call->rx_queue)) {
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- }
-
- /* don't leave aborted connections on the accept queue */
- if (call->state >= RXRPC_CALL_COMPLETE &&
- !list_empty(&call->accept_link)) {
- _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
- call, call->events, call->flags, call->conn->proto.cid);
-
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- }
-
-error:
- clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags);
- kfree(acks);
-
- /* because we don't want two CPUs both processing the work item for one
- * call at the same time, we use a flag to note when it's busy; however
- * this means there's a race between clearing the flag and setting the
- * work pending bit and the work item being processed again */
- if (call->events && !work_pending(&call->processor)) {
- _debug("jumpstart %x", call->conn->proto.cid);
- rxrpc_queue_call(call);
+ if (call->events && call->state < RXRPC_CALL_COMPLETE) {
+ __rxrpc_queue_call(call);
+ goto out;
}
+out_put:
+ rxrpc_put_call(call, rxrpc_call_put);
+out:
_leave("");
- return;
-
-no_mem:
- _debug("out of memory");
- goto maybe_reschedule;
}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index ae057e0740f3..1ed18d8c9c9f 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -19,23 +19,13 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-/*
- * Maximum lifetime of a call (in jiffies).
- */
-unsigned int rxrpc_max_call_lifetime = 60 * HZ;
-
-/*
- * Time till dead call expires after last use (in jiffies).
- */
-unsigned int rxrpc_dead_call_expiry = 2 * HZ;
-
const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
- [RXRPC_CALL_UNINITIALISED] = "Uninit",
+ [RXRPC_CALL_UNINITIALISED] = "Uninit ",
[RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
[RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
[RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
[RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
- [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK",
+ [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc",
[RXRPC_CALL_SERVER_SECURING] = "SvSecure",
[RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept",
[RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
@@ -43,22 +33,47 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
[RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK",
[RXRPC_CALL_COMPLETE] = "Complete",
- [RXRPC_CALL_SERVER_BUSY] = "SvBusy ",
+};
+
+const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
+ [RXRPC_CALL_SUCCEEDED] = "Complete",
[RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort",
[RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort",
+ [RXRPC_CALL_LOCAL_ERROR] = "LocError",
[RXRPC_CALL_NETWORK_ERROR] = "NetError",
- [RXRPC_CALL_DEAD] = "Dead ",
+};
+
+const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
+ [rxrpc_call_new_client] = "NWc",
+ [rxrpc_call_new_service] = "NWs",
+ [rxrpc_call_queued] = "QUE",
+ [rxrpc_call_queued_ref] = "QUR",
+ [rxrpc_call_connected] = "CON",
+ [rxrpc_call_release] = "RLS",
+ [rxrpc_call_seen] = "SEE",
+ [rxrpc_call_got] = "GOT",
+ [rxrpc_call_got_userid] = "Gus",
+ [rxrpc_call_got_kernel] = "Gke",
+ [rxrpc_call_put] = "PUT",
+ [rxrpc_call_put_userid] = "Pus",
+ [rxrpc_call_put_kernel] = "Pke",
+ [rxrpc_call_put_noqueue] = "PNQ",
+ [rxrpc_call_error] = "*E*",
};
struct kmem_cache *rxrpc_call_jar;
LIST_HEAD(rxrpc_calls);
DEFINE_RWLOCK(rxrpc_call_lock);
-static void rxrpc_destroy_call(struct work_struct *work);
-static void rxrpc_call_life_expired(unsigned long _call);
-static void rxrpc_dead_call_expired(unsigned long _call);
-static void rxrpc_ack_time_expired(unsigned long _call);
-static void rxrpc_resend_time_expired(unsigned long _call);
+static void rxrpc_call_timer_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *)_call;
+
+ _enter("%d", call->debug_id);
+
+ if (call->state < RXRPC_CALL_COMPLETE)
+ rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
+}
/*
* find an extant server call
@@ -91,7 +106,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx,
return NULL;
found_extant_call:
- rxrpc_get_call(call);
+ rxrpc_get_call(call, rxrpc_call_got);
read_unlock(&rx->call_lock);
_leave(" = %p [%d]", call, atomic_read(&call->usage));
return call;
@@ -100,7 +115,7 @@ found_extant_call:
/*
* allocate a new call
*/
-static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
{
struct rxrpc_call *call;
@@ -108,29 +123,25 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
if (!call)
return NULL;
- call->acks_winsz = 16;
- call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long),
+ call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE,
+ sizeof(struct sk_buff *),
gfp);
- if (!call->acks_window) {
- kmem_cache_free(rxrpc_call_jar, call);
- return NULL;
- }
+ if (!call->rxtx_buffer)
+ goto nomem;
+
+ call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp);
+ if (!call->rxtx_annotations)
+ goto nomem_2;
- setup_timer(&call->lifetimer, &rxrpc_call_life_expired,
- (unsigned long) call);
- setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
- (unsigned long) call);
- setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
- (unsigned long) call);
- setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
- (unsigned long) call);
- INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
+ setup_timer(&call->timer, rxrpc_call_timer_expired,
+ (unsigned long)call);
INIT_WORK(&call->processor, &rxrpc_process_call);
INIT_LIST_HEAD(&call->link);
+ INIT_LIST_HEAD(&call->chan_wait_link);
INIT_LIST_HEAD(&call->accept_link);
- skb_queue_head_init(&call->rx_queue);
- skb_queue_head_init(&call->rx_oos_queue);
- init_waitqueue_head(&call->tx_waitq);
+ INIT_LIST_HEAD(&call->recvmsg_link);
+ INIT_LIST_HEAD(&call->sock_link);
+ init_waitqueue_head(&call->waitq);
spin_lock_init(&call->lock);
rwlock_init(&call->state_lock);
atomic_set(&call->usage, 1);
@@ -138,70 +149,66 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
memset(&call->sock_node, 0xed, sizeof(call->sock_node));
- call->rx_data_expect = 1;
- call->rx_data_eaten = 0;
- call->rx_first_oos = 0;
- call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size;
- call->creation_jif = jiffies;
+ /* Leave space in the ring to handle a maxed-out jumbo packet */
+ call->rx_winsize = rxrpc_rx_window_size;
+ call->tx_winsize = 16;
+ call->rx_expect_next = 1;
+
+ if (RXRPC_TX_SMSS > 2190)
+ call->cong_cwnd = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ call->cong_cwnd = 3;
+ else
+ call->cong_cwnd = 4;
+ call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
return call;
+
+nomem_2:
+ kfree(call->rxtx_buffer);
+nomem:
+ kmem_cache_free(rxrpc_call_jar, call);
+ return NULL;
}
/*
* Allocate a new client call.
*/
-static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
- struct sockaddr_rxrpc *srx,
+static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
gfp_t gfp)
{
struct rxrpc_call *call;
+ ktime_t now;
_enter("");
- ASSERT(rx->local != NULL);
-
call = rxrpc_alloc_call(gfp);
if (!call)
return ERR_PTR(-ENOMEM);
call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
-
- sock_hold(&rx->sk);
- call->socket = rx;
- call->rx_data_post = 1;
-
- call->local = rx->local;
call->service_id = srx->srx_service;
- call->in_clientflag = 0;
+ call->tx_phase = true;
+ now = ktime_get_real();
+ call->acks_latest_ts = now;
+ call->cong_tstamp = now;
_leave(" = %p", call);
return call;
}
/*
- * Begin client call.
+ * Initiate the call ack/resend/expiry timer.
*/
-static int rxrpc_begin_client_call(struct rxrpc_call *call,
- struct rxrpc_conn_parameters *cp,
- struct sockaddr_rxrpc *srx,
- gfp_t gfp)
+static void rxrpc_start_call_timer(struct rxrpc_call *call)
{
- int ret;
-
- /* Set up or get a connection record and set the protocol parameters,
- * including channel number and call ID.
- */
- ret = rxrpc_connect_call(call, cp, srx, gfp);
- if (ret < 0)
- return ret;
-
- call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
-
- spin_lock(&call->conn->params.peer->lock);
- hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets);
- spin_unlock(&call->conn->params.peer->lock);
-
- call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
- add_timer(&call->lifetimer);
- return 0;
+ ktime_t now = ktime_get_real(), expire_at;
+
+ expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
+ call->expire_at = expire_at;
+ call->ack_at = expire_at;
+ call->ping_at = expire_at;
+ call->resend_at = expire_at;
+ call->timer.expires = jiffies + LONG_MAX / 2;
+ rxrpc_set_timer(call, rxrpc_timer_begin, now);
}
/*
@@ -216,20 +223,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
{
struct rxrpc_call *call, *xcall;
struct rb_node *parent, **pp;
+ const void *here = __builtin_return_address(0);
int ret;
_enter("%p,%lx", rx, user_call_ID);
- call = rxrpc_alloc_client_call(rx, srx, gfp);
+ call = rxrpc_alloc_client_call(srx, gfp);
if (IS_ERR(call)) {
_leave(" = %ld", PTR_ERR(call));
return call;
}
- /* Publish the call, even though it is incompletely set up as yet */
- call->user_call_ID = user_call_ID;
- __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
+ here, (const void *)user_call_ID);
+ /* Publish the call, even though it is incompletely set up as yet */
write_lock(&rx->call_lock);
pp = &rx->calls.rb_node;
@@ -243,369 +251,285 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
else if (user_call_ID > xcall->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto found_user_ID_now_present;
+ goto error_dup_user_ID;
}
- rxrpc_get_call(call);
-
+ rcu_assign_pointer(call->socket, rx);
+ call->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
+ list_add(&call->sock_link, &rx->sock_calls);
+
write_unlock(&rx->call_lock);
- write_lock_bh(&rxrpc_call_lock);
+ write_lock(&rxrpc_call_lock);
list_add_tail(&call->link, &rxrpc_calls);
- write_unlock_bh(&rxrpc_call_lock);
+ write_unlock(&rxrpc_call_lock);
- ret = rxrpc_begin_client_call(call, cp, srx, gfp);
+ /* Set up or get a connection record and set the protocol parameters,
+ * including channel number and call ID.
+ */
+ ret = rxrpc_connect_call(call, cp, srx, gfp);
if (ret < 0)
goto error;
- _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+ trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage),
+ here, NULL);
- _leave(" = %p [new]", call);
- return call;
+ spin_lock_bh(&call->conn->params.peer->lock);
+ hlist_add_head(&call->error_link,
+ &call->conn->params.peer->error_targets);
+ spin_unlock_bh(&call->conn->params.peer->lock);
-error:
- write_lock(&rx->call_lock);
- rb_erase(&call->sock_node, &rx->calls);
- write_unlock(&rx->call_lock);
- rxrpc_put_call(call);
+ rxrpc_start_call_timer(call);
- write_lock_bh(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock_bh(&rxrpc_call_lock);
+ _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- call->state = RXRPC_CALL_DEAD;
- rxrpc_put_call(call);
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ _leave(" = %p [new]", call);
+ return call;
/* We unexpectedly found the user ID in the list after taking
* the call_lock. This shouldn't happen unless the user races
* with itself and tries to add the same user ID twice at the
* same time in different threads.
*/
-found_user_ID_now_present:
+error_dup_user_ID:
write_unlock(&rx->call_lock);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- call->state = RXRPC_CALL_DEAD;
- rxrpc_put_call(call);
- _leave(" = -EEXIST [%p]", call);
- return ERR_PTR(-EEXIST);
+ ret = -EEXIST;
+
+error:
+ __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+ RX_CALL_DEAD, ret);
+ trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
+ here, ERR_PTR(ret));
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
}
/*
- * set up an incoming call
- * - called in process context with IRQs enabled
+ * Set up an incoming call. call->conn points to the connection.
+ * This is called in BH context and isn't allowed to fail.
*/
-struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
- struct rxrpc_connection *conn,
- struct sk_buff *skb)
+void rxrpc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct sk_buff *skb)
{
+ struct rxrpc_connection *conn = call->conn;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_call *call, *candidate;
- u32 call_id, chan;
-
- _enter(",%d", conn->debug_id);
-
- ASSERT(rx != NULL);
-
- candidate = rxrpc_alloc_call(GFP_NOIO);
- if (!candidate)
- return ERR_PTR(-EBUSY);
-
- chan = sp->hdr.cid & RXRPC_CHANNELMASK;
- candidate->socket = rx;
- candidate->conn = conn;
- candidate->cid = sp->hdr.cid;
- candidate->call_id = sp->hdr.callNumber;
- candidate->channel = chan;
- candidate->rx_data_post = 0;
- candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
- if (conn->security_ix > 0)
- candidate->state = RXRPC_CALL_SERVER_SECURING;
-
- spin_lock(&conn->channel_lock);
-
- /* set the channel for this call */
- call = rcu_dereference_protected(conn->channels[chan].call,
- lockdep_is_held(&conn->channel_lock));
-
- _debug("channel[%u] is %p", candidate->channel, call);
- if (call && call->call_id == sp->hdr.callNumber) {
- /* already set; must've been a duplicate packet */
- _debug("extant call [%d]", call->state);
- ASSERTCMP(call->conn, ==, conn);
-
- read_lock(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
- rxrpc_queue_call(call);
- case RXRPC_CALL_REMOTELY_ABORTED:
- read_unlock(&call->state_lock);
- goto aborted_call;
- default:
- rxrpc_get_call(call);
- read_unlock(&call->state_lock);
- goto extant_call;
- }
- }
-
- if (call) {
- /* it seems the channel is still in use from the previous call
- * - ditch the old binding if its call is now complete */
- _debug("CALL: %u { %s }",
- call->debug_id, rxrpc_call_states[call->state]);
-
- if (call->state >= RXRPC_CALL_COMPLETE) {
- __rxrpc_disconnect_call(call);
- } else {
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -EBUSY");
- return ERR_PTR(-EBUSY);
- }
- }
-
- /* check the call number isn't duplicate */
- _debug("check dup");
- call_id = sp->hdr.callNumber;
-
- /* We just ignore calls prior to the current call ID. Terminated calls
- * are handled via the connection.
+ u32 chan;
+
+ _enter(",%d", call->conn->debug_id);
+
+ rcu_assign_pointer(call->socket, rx);
+ call->call_id = sp->hdr.callNumber;
+ call->service_id = sp->hdr.serviceId;
+ call->cid = sp->hdr.cid;
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ if (sp->hdr.securityIndex > 0)
+ call->state = RXRPC_CALL_SERVER_SECURING;
+ call->cong_tstamp = skb->tstamp;
+
+ /* Set the channel for this call. We don't get channel_lock as we're
+ * only defending against the data_ready handler (which we're called
+ * from) and the RESPONSE packet parser (which is only really
+ * interested in call_counter and can cope with a disagreement with the
+ * call pointer).
*/
- if (call_id <= conn->channels[chan].call_counter)
- goto old_call; /* TODO: Just drop packet */
-
- /* make the call available */
- _debug("new call");
- call = candidate;
- candidate = NULL;
- conn->channels[chan].call_counter = call_id;
+ chan = sp->hdr.cid & RXRPC_CHANNELMASK;
+ conn->channels[chan].call_counter = call->call_id;
+ conn->channels[chan].call_id = call->call_id;
rcu_assign_pointer(conn->channels[chan].call, call);
- sock_hold(&rx->sk);
- rxrpc_get_connection(conn);
- spin_unlock(&conn->channel_lock);
spin_lock(&conn->params.peer->lock);
hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
spin_unlock(&conn->params.peer->lock);
- write_lock_bh(&rxrpc_call_lock);
- list_add_tail(&call->link, &rxrpc_calls);
- write_unlock_bh(&rxrpc_call_lock);
+ _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
- call->local = conn->params.local;
- call->epoch = conn->proto.epoch;
- call->service_id = conn->params.service_id;
- call->in_clientflag = RXRPC_CLIENT_INITIATED;
+ rxrpc_start_call_timer(call);
+ _leave("");
+}
- _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+/*
+ * Queue a call's work processor, getting a ref to pass to the work queue.
+ */
+bool rxrpc_queue_call(struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
+ int n = __atomic_add_unless(&call->usage, 1, 0);
+ if (n == 0)
+ return false;
+ if (rxrpc_queue_work(&call->processor))
+ trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL);
+ else
+ rxrpc_put_call(call, rxrpc_call_put_noqueue);
+ return true;
+}
- call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
- add_timer(&call->lifetimer);
- _leave(" = %p {%d} [new]", call, call->debug_id);
- return call;
+/*
+ * Queue a call's work processor, passing the callers ref to the work queue.
+ */
+bool __rxrpc_queue_call(struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_read(&call->usage);
+ ASSERTCMP(n, >=, 1);
+ if (rxrpc_queue_work(&call->processor))
+ trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL);
+ else
+ rxrpc_put_call(call, rxrpc_call_put_noqueue);
+ return true;
+}
-extant_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
- return call;
+/*
+ * Note the re-emergence of a call.
+ */
+void rxrpc_see_call(struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
+ if (call) {
+ int n = atomic_read(&call->usage);
-aborted_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -ECONNABORTED");
- return ERR_PTR(-ECONNABORTED);
-
-old_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -ECONNRESET [old]");
- return ERR_PTR(-ECONNRESET);
+ trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL);
+ }
}
/*
- * detach a call from a socket and set up for release
+ * Note the addition of a ref on a call.
*/
-void rxrpc_release_call(struct rxrpc_call *call)
+void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(&call->usage);
+
+ trace_rxrpc_call(call, op, n, here, NULL);
+}
+
+/*
+ * Detach a call from its owning socket.
+ */
+void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
struct rxrpc_connection *conn = call->conn;
- struct rxrpc_sock *rx = call->socket;
+ bool put = false;
+ int i;
- _enter("{%d,%d,%d,%d}",
- call->debug_id, atomic_read(&call->usage),
- atomic_read(&call->ackr_not_idle),
- call->rx_first_oos);
+ _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage));
+
+ trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage),
+ here, (const void *)call->flags);
+
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
spin_lock_bh(&call->lock);
if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
spin_unlock_bh(&call->lock);
- /* dissociate from the socket
- * - the socket's ref on the call is passed to the death timer
- */
- _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
+ del_timer_sync(&call->timer);
- spin_lock(&conn->params.peer->lock);
- hlist_del_init(&call->error_link);
- spin_unlock(&conn->params.peer->lock);
+ /* Make sure we don't get any more notifications */
+ write_lock_bh(&rx->recvmsg_lock);
- write_lock_bh(&rx->call_lock);
- if (!list_empty(&call->accept_link)) {
+ if (!list_empty(&call->recvmsg_link)) {
_debug("unlinking once-pending call %p { e=%lx f=%lx }",
call, call->events, call->flags);
- ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
- list_del_init(&call->accept_link);
- sk_acceptq_removed(&rx->sk);
- } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- rb_erase(&call->sock_node, &rx->calls);
- memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
- clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ list_del(&call->recvmsg_link);
+ put = true;
}
- write_unlock_bh(&rx->call_lock);
- /* free up the channel for reuse */
- write_lock_bh(&call->state_lock);
+ /* list_empty() must return false in rxrpc_notify_socket() */
+ call->recvmsg_link.next = NULL;
+ call->recvmsg_link.prev = NULL;
- if (call->state < RXRPC_CALL_COMPLETE &&
- call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
- _debug("+++ ABORTING STATE %d +++\n", call->state);
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = RX_CALL_DEAD;
- }
- write_unlock_bh(&call->state_lock);
+ write_unlock_bh(&rx->recvmsg_lock);
+ if (put)
+ rxrpc_put_call(call, rxrpc_call_put);
- rxrpc_disconnect_call(call);
+ write_lock(&rx->call_lock);
- /* clean up the Rx queue */
- if (!skb_queue_empty(&call->rx_queue) ||
- !skb_queue_empty(&call->rx_oos_queue)) {
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
+ if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ rb_erase(&call->sock_node, &rx->calls);
+ memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
+ rxrpc_put_call(call, rxrpc_call_put_userid);
+ }
- _debug("purge Rx queues");
+ list_del(&call->sock_link);
+ write_unlock(&rx->call_lock);
- spin_lock_bh(&call->lock);
- while ((skb = skb_dequeue(&call->rx_queue)) ||
- (skb = skb_dequeue(&call->rx_oos_queue))) {
- spin_unlock_bh(&call->lock);
+ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
- sp = rxrpc_skb(skb);
- _debug("- zap %s %%%u #%u",
- rxrpc_pkts[sp->hdr.type],
- sp->hdr.serial, sp->hdr.seq);
- rxrpc_free_skb(skb);
- spin_lock_bh(&call->lock);
- }
- spin_unlock_bh(&call->lock);
+ if (conn)
+ rxrpc_disconnect_call(call);
- ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE);
+ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
+ call->rxtx_buffer[i] = NULL;
}
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- del_timer_sync(&call->lifetimer);
- call->deadspan.expires = jiffies + rxrpc_dead_call_expiry;
- add_timer(&call->deadspan);
-
_leave("");
}
/*
- * handle a dead call being ready for reaping
- */
-static void rxrpc_dead_call_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- write_lock_bh(&call->state_lock);
- call->state = RXRPC_CALL_DEAD;
- write_unlock_bh(&call->state_lock);
- rxrpc_put_call(call);
-}
-
-/*
- * mark a call as to be released, aborting it if it's still in progress
- * - called with softirqs disabled
- */
-static void rxrpc_mark_call_released(struct rxrpc_call *call)
-{
- bool sched;
-
- write_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD) {
- sched = false;
- if (call->state < RXRPC_CALL_COMPLETE) {
- _debug("abort call %p", call);
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = RX_CALL_DEAD;
- if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
- sched = true;
- }
- if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- sched = true;
- if (sched)
- rxrpc_queue_call(call);
- }
- write_unlock(&call->state_lock);
-}
-
-/*
* release all the calls associated with a socket
*/
void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
{
struct rxrpc_call *call;
- struct rb_node *p;
_enter("%p", rx);
- read_lock_bh(&rx->call_lock);
-
- /* mark all the calls as no longer wanting incoming packets */
- for (p = rb_first(&rx->calls); p; p = rb_next(p)) {
- call = rb_entry(p, struct rxrpc_call, sock_node);
- rxrpc_mark_call_released(call);
- }
-
- /* kill the not-yet-accepted incoming calls */
- list_for_each_entry(call, &rx->secureq, accept_link) {
- rxrpc_mark_call_released(call);
+ while (!list_empty(&rx->to_be_accepted)) {
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
+ list_del(&call->accept_link);
+ rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_put_call(call, rxrpc_call_put);
}
- list_for_each_entry(call, &rx->acceptq, accept_link) {
- rxrpc_mark_call_released(call);
+ while (!list_empty(&rx->sock_calls)) {
+ call = list_entry(rx->sock_calls.next,
+ struct rxrpc_call, sock_link);
+ rxrpc_get_call(call, rxrpc_call_got);
+ rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_send_abort_packet(call);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
}
- read_unlock_bh(&rx->call_lock);
_leave("");
}
/*
* release a call
*/
-void __rxrpc_put_call(struct rxrpc_call *call)
+void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
{
+ const void *here = __builtin_return_address(0);
+ int n;
+
ASSERT(call != NULL);
- _enter("%p{u=%d}", call, atomic_read(&call->usage));
+ n = atomic_dec_return(&call->usage);
+ trace_rxrpc_call(call, op, n, here, NULL);
+ ASSERTCMP(n, >=, 0);
+ if (n == 0) {
+ _debug("call %d dead", call->debug_id);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
- ASSERTCMP(atomic_read(&call->usage), >, 0);
+ write_lock(&rxrpc_call_lock);
+ list_del_init(&call->link);
+ write_unlock(&rxrpc_call_lock);
- if (atomic_dec_and_test(&call->usage)) {
- _debug("call %d dead", call->debug_id);
- WARN_ON(atomic_read(&call->skb_count) != 0);
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
- rxrpc_queue_work(&call->destroyer);
+ rxrpc_cleanup_call(call);
}
- _leave("");
}
/*
@@ -615,187 +539,70 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
{
struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
- rxrpc_purge_queue(&call->rx_queue);
+ rxrpc_put_peer(call->peer);
+ kfree(call->rxtx_buffer);
+ kfree(call->rxtx_annotations);
kmem_cache_free(rxrpc_call_jar, call);
}
/*
* clean up a call
*/
-static void rxrpc_cleanup_call(struct rxrpc_call *call)
+void rxrpc_cleanup_call(struct rxrpc_call *call)
{
- _net("DESTROY CALL %d", call->debug_id);
+ int i;
- ASSERT(call->socket);
+ _net("DESTROY CALL %d", call->debug_id);
memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
- del_timer_sync(&call->lifetimer);
- del_timer_sync(&call->deadspan);
- del_timer_sync(&call->ack_timer);
- del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->timer);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
- ASSERTCMP(call->events, ==, 0);
- if (work_pending(&call->processor)) {
- _debug("defer destroy");
- rxrpc_queue_work(&call->destroyer);
- return;
- }
-
ASSERTCMP(call->conn, ==, NULL);
- if (call->acks_window) {
- _debug("kill Tx window %d",
- CIRC_CNT(call->acks_head, call->acks_tail,
- call->acks_winsz));
- smp_mb();
- while (CIRC_CNT(call->acks_head, call->acks_tail,
- call->acks_winsz) > 0) {
- struct rxrpc_skb_priv *sp;
- unsigned long _skb;
-
- _skb = call->acks_window[call->acks_tail] & ~1;
- sp = rxrpc_skb((struct sk_buff *)_skb);
- _debug("+++ clear Tx %u", sp->hdr.seq);
- rxrpc_free_skb((struct sk_buff *)_skb);
- call->acks_tail =
- (call->acks_tail + 1) & (call->acks_winsz - 1);
- }
-
- kfree(call->acks_window);
- }
+ /* Clean up the Rx/Tx buffer */
+ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
- rxrpc_free_skb(call->tx_pending);
+ rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned);
- rxrpc_purge_queue(&call->rx_queue);
- ASSERT(skb_queue_empty(&call->rx_oos_queue));
- sock_put(&call->socket->sk);
call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
}
/*
- * destroy a call
- */
-static void rxrpc_destroy_call(struct work_struct *work)
-{
- struct rxrpc_call *call =
- container_of(work, struct rxrpc_call, destroyer);
-
- _enter("%p{%d,%d,%p}",
- call, atomic_read(&call->usage), call->channel, call->conn);
-
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
-
- write_lock_bh(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock_bh(&rxrpc_call_lock);
-
- rxrpc_cleanup_call(call);
- _leave("");
-}
-
-/*
- * preemptively destroy all the call records from a transport endpoint rather
- * than waiting for them to time out
+ * Make sure that all calls are gone.
*/
void __exit rxrpc_destroy_all_calls(void)
{
struct rxrpc_call *call;
_enter("");
- write_lock_bh(&rxrpc_call_lock);
+
+ if (list_empty(&rxrpc_calls))
+ return;
+
+ write_lock(&rxrpc_call_lock);
while (!list_empty(&rxrpc_calls)) {
call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
_debug("Zapping call %p", call);
+ rxrpc_see_call(call);
list_del_init(&call->link);
- switch (atomic_read(&call->usage)) {
- case 0:
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
- break;
- case 1:
- if (del_timer_sync(&call->deadspan) != 0 &&
- call->state != RXRPC_CALL_DEAD)
- rxrpc_dead_call_expired((unsigned long) call);
- if (call->state != RXRPC_CALL_DEAD)
- break;
- default:
- pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
- call, atomic_read(&call->usage),
- atomic_read(&call->ackr_not_idle),
- rxrpc_call_states[call->state],
- call->flags, call->events);
- if (!skb_queue_empty(&call->rx_queue))
- pr_err("Rx queue occupied\n");
- if (!skb_queue_empty(&call->rx_oos_queue))
- pr_err("OOS queue occupied\n");
- break;
- }
-
- write_unlock_bh(&rxrpc_call_lock);
- cond_resched();
- write_lock_bh(&rxrpc_call_lock);
- }
-
- write_unlock_bh(&rxrpc_call_lock);
- _leave("");
-}
-
-/*
- * handle call lifetime being exceeded
- */
-static void rxrpc_call_life_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
+ pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n",
+ call, atomic_read(&call->usage),
+ rxrpc_call_states[call->state],
+ call->flags, call->events);
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- _enter("{%d}", call->debug_id);
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE) {
- set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
- rxrpc_queue_call(call);
+ write_unlock(&rxrpc_call_lock);
+ cond_resched();
+ write_lock(&rxrpc_call_lock);
}
- read_unlock_bh(&call->state_lock);
-}
-
-/*
- * handle resend timer expiry
- * - may not take call->state_lock as this can deadlock against del_timer_sync()
- */
-static void rxrpc_resend_time_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
- rxrpc_queue_call(call);
-}
-
-/*
- * handle ACK timer expiry
- */
-static void rxrpc_ack_time_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
+ write_unlock(&rxrpc_call_lock);
}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 9e91f27b0d0f..6cbcdcc29853 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -7,6 +7,68 @@
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
+ *
+ *
+ * Client connections need to be cached for a little while after they've made a
+ * call so as to handle retransmitted DATA packets in case the server didn't
+ * receive the final ACK or terminating ABORT we sent it.
+ *
+ * Client connections can be in one of a number of cache states:
+ *
+ * (1) INACTIVE - The connection is not held in any list and may not have been
+ * exposed to the world. If it has been previously exposed, it was
+ * discarded from the idle list after expiring.
+ *
+ * (2) WAITING - The connection is waiting for the number of client conns to
+ * drop below the maximum capacity. Calls may be in progress upon it from
+ * when it was active and got culled.
+ *
+ * The connection is on the rxrpc_waiting_client_conns list which is kept
+ * in to-be-granted order. Culled conns with waiters go to the back of
+ * the queue just like new conns.
+ *
+ * (3) ACTIVE - The connection has at least one call in progress upon it, it
+ * may freely grant available channels to new calls and calls may be
+ * waiting on it for channels to become available.
+ *
+ * The connection is on the rxrpc_active_client_conns list which is kept
+ * in activation order for culling purposes.
+ *
+ * rxrpc_nr_active_client_conns is held incremented also.
+ *
+ * (4) CULLED - The connection got summarily culled to try and free up
+ * capacity. Calls currently in progress on the connection are allowed to
+ * continue, but new calls will have to wait. There can be no waiters in
+ * this state - the conn would have to go to the WAITING state instead.
+ *
+ * (5) IDLE - The connection has no calls in progress upon it and must have
+ * been exposed to the world (ie. the EXPOSED flag must be set). When it
+ * expires, the EXPOSED flag is cleared and the connection transitions to
+ * the INACTIVE state.
+ *
+ * The connection is on the rxrpc_idle_client_conns list which is kept in
+ * order of how soon they'll expire.
+ *
+ * There are flags of relevance to the cache:
+ *
+ * (1) EXPOSED - The connection ID got exposed to the world. If this flag is
+ * set, an extra ref is added to the connection preventing it from being
+ * reaped when it has no calls outstanding. This flag is cleared and the
+ * ref dropped when a conn is discarded from the idle list.
+ *
+ * This allows us to move terminal call state retransmission to the
+ * connection and to discard the call immediately we think it is done
+ * with. It also give us a chance to reuse the connection.
+ *
+ * (2) DONT_REUSE - The connection should be discarded as soon as possible and
+ * should not be reused. This is set when an exclusive connection is used
+ * or a call ID counter overflows.
+ *
+ * The caching state may only be changed if the cache lock is held.
+ *
+ * There are two idle client connection expiry durations. If the total number
+ * of connections is below the reap threshold, we use the normal duration; if
+ * it's above, we use the fast duration.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -16,27 +78,50 @@
#include <linux/timer.h>
#include "ar-internal.h"
+__read_mostly unsigned int rxrpc_max_client_connections = 1000;
+__read_mostly unsigned int rxrpc_reap_client_connections = 900;
+__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
+__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
+
+static unsigned int rxrpc_nr_client_conns;
+static unsigned int rxrpc_nr_active_client_conns;
+static __read_mostly bool rxrpc_kill_all_client_conns;
+
+static DEFINE_SPINLOCK(rxrpc_client_conn_cache_lock);
+static DEFINE_SPINLOCK(rxrpc_client_conn_discard_mutex);
+static LIST_HEAD(rxrpc_waiting_client_conns);
+static LIST_HEAD(rxrpc_active_client_conns);
+static LIST_HEAD(rxrpc_idle_client_conns);
+
/*
* We use machine-unique IDs for our client connections.
*/
DEFINE_IDR(rxrpc_client_conn_ids);
static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
+static void rxrpc_cull_active_client_conns(void);
+static void rxrpc_discard_expired_client_conns(struct work_struct *);
+
+static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
+ rxrpc_discard_expired_client_conns);
+
+const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
+ [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
+ [RXRPC_CONN_CLIENT_WAITING] = "Wait",
+ [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
+ [RXRPC_CONN_CLIENT_CULLED] = "Cull",
+ [RXRPC_CONN_CLIENT_IDLE] = "Idle",
+};
+
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
- * epoch is changed if this wraps.
- *
- * TODO: The IDR tree gets very expensive on memory if the connection IDs are
- * widely scattered throughout the number space, so we shall need to retire
- * connections that have, say, an ID more than four times the maximum number of
- * client conns away from the current allocation point to try and keep the IDs
- * concentrated. We will also need to retire connections from an old epoch.
+ * epoch doesn't change until the client is rebooted (or, at least, unless the
+ * module is unloaded).
*/
static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
gfp_t gfp)
{
- u32 epoch;
int id;
_enter("");
@@ -44,34 +129,18 @@ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
idr_preload(gfp);
spin_lock(&rxrpc_conn_id_lock);
- epoch = rxrpc_epoch;
-
- /* We could use idr_alloc_cyclic() here, but we really need to know
- * when the thing wraps so that we can advance the epoch.
- */
- if (rxrpc_client_conn_ids.cur == 0)
- rxrpc_client_conn_ids.cur = 1;
- id = idr_alloc(&rxrpc_client_conn_ids, conn,
- rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT);
- if (id < 0) {
- if (id != -ENOSPC)
- goto error;
- id = idr_alloc(&rxrpc_client_conn_ids, conn,
- 1, 0x40000000, GFP_NOWAIT);
- if (id < 0)
- goto error;
- epoch++;
- rxrpc_epoch = epoch;
- }
- rxrpc_client_conn_ids.cur = id + 1;
+ id = idr_alloc_cyclic(&rxrpc_client_conn_ids, conn,
+ 1, 0x40000000, GFP_NOWAIT);
+ if (id < 0)
+ goto error;
spin_unlock(&rxrpc_conn_id_lock);
idr_preload_end();
- conn->proto.epoch = epoch;
+ conn->proto.epoch = rxrpc_epoch;
conn->proto.cid = id << RXRPC_CIDSHIFT;
set_bit(RXRPC_CONN_HAS_IDR, &conn->flags);
- _leave(" [CID %x:%x]", epoch, conn->proto.cid);
+ _leave(" [CID %x]", conn->proto.cid);
return 0;
error:
@@ -114,8 +183,7 @@ void rxrpc_destroy_client_conn_ids(void)
}
/*
- * Allocate a client connection. The caller must take care to clear any
- * padding bytes in *cp.
+ * Allocate a client connection.
*/
static struct rxrpc_connection *
rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
@@ -131,6 +199,10 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
return ERR_PTR(-ENOMEM);
}
+ atomic_set(&conn->usage, 1);
+ if (cp->exclusive)
+ __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+
conn->params = *cp;
conn->out_clientflag = RXRPC_CLIENT_INITIATED;
conn->state = RXRPC_CONN_CLIENT;
@@ -148,7 +220,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
goto error_2;
write_lock(&rxrpc_connection_lock);
- list_add_tail(&conn->link, &rxrpc_connections);
+ list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
write_unlock(&rxrpc_connection_lock);
/* We steal the caller's peer ref. */
@@ -156,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
+ trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage),
+ __builtin_return_address(0));
+ trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
return conn;
@@ -170,32 +245,68 @@ error_0:
}
/*
- * find a connection for a call
- * - called in process context with IRQs enabled
+ * Determine if a connection may be reused.
*/
-int rxrpc_connect_call(struct rxrpc_call *call,
- struct rxrpc_conn_parameters *cp,
- struct sockaddr_rxrpc *srx,
- gfp_t gfp)
+static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
+{
+ int id_cursor, id, distance, limit;
+
+ if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags))
+ goto dont_reuse;
+
+ if (conn->proto.epoch != rxrpc_epoch)
+ goto mark_dont_reuse;
+
+ /* The IDR tree gets very expensive on memory if the connection IDs are
+ * widely scattered throughout the number space, so we shall want to
+ * kill off connections that, say, have an ID more than about four
+ * times the maximum number of client conns away from the current
+ * allocation point to try and keep the IDs concentrated.
+ */
+ id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);
+ id = conn->proto.cid >> RXRPC_CIDSHIFT;
+ distance = id - id_cursor;
+ if (distance < 0)
+ distance = -distance;
+ limit = max(rxrpc_max_client_connections * 4, 1024U);
+ if (distance > limit)
+ goto mark_dont_reuse;
+
+ return true;
+
+mark_dont_reuse:
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+dont_reuse:
+ return false;
+}
+
+/*
+ * Create or find a client connection to use for a call.
+ *
+ * If we return with a connection, the call will be on its waiting list. It's
+ * left to the caller to assign a channel and wake up the call.
+ */
+static int rxrpc_get_client_conn(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
{
struct rxrpc_connection *conn, *candidate = NULL;
struct rxrpc_local *local = cp->local;
struct rb_node *p, **pp, *parent;
long diff;
- int chan;
-
- DECLARE_WAITQUEUE(myself, current);
+ int ret = -ENOMEM;
_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
if (!cp->peer)
- return -ENOMEM;
+ goto error;
+ /* If the connection is not meant to be exclusive, search the available
+ * connections to see if the connection we want to use already exists.
+ */
if (!cp->exclusive) {
- /* Search for a existing client connection unless this is going
- * to be a connection that's used exclusively for a single call.
- */
_debug("search 1");
spin_lock(&local->client_conns_lock);
p = local->client_conns.rb_node;
@@ -206,39 +317,56 @@ int rxrpc_connect_call(struct rxrpc_call *call,
diff = (cmp(peer) ?:
cmp(key) ?:
cmp(security_level));
- if (diff < 0)
+#undef cmp
+ if (diff < 0) {
p = p->rb_left;
- else if (diff > 0)
+ } else if (diff > 0) {
p = p->rb_right;
- else
- goto found_extant_conn;
+ } else {
+ if (rxrpc_may_reuse_conn(conn) &&
+ rxrpc_get_connection_maybe(conn))
+ goto found_extant_conn;
+ /* The connection needs replacing. It's better
+ * to effect that when we have something to
+ * replace it with so that we don't have to
+ * rebalance the tree twice.
+ */
+ break;
+ }
}
spin_unlock(&local->client_conns_lock);
}
- /* We didn't find a connection or we want an exclusive one. */
- _debug("get new conn");
+ /* There wasn't a connection yet or we need an exclusive connection.
+ * We need to create a candidate and then potentially redo the search
+ * in case we're racing with another thread also trying to connect on a
+ * shareable connection.
+ */
+ _debug("new conn");
candidate = rxrpc_alloc_client_connection(cp, gfp);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
+ if (IS_ERR(candidate)) {
+ ret = PTR_ERR(candidate);
+ goto error_peer;
}
+ /* Add the call to the new connection's waiting list in case we're
+ * going to have to wait for the connection to come live. It's our
+ * connection, so we want first dibs on the channel slots. We would
+ * normally have to take channel_lock but we do this before anyone else
+ * can see the connection.
+ */
+ list_add_tail(&call->chan_wait_link, &candidate->waiting_calls);
+
if (cp->exclusive) {
- /* Assign the call on an exclusive connection to channel 0 and
- * don't add the connection to the endpoint's shareable conn
- * lookup tree.
- */
- _debug("exclusive chan 0");
- conn = candidate;
- atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
- spin_lock(&conn->channel_lock);
- chan = 0;
- goto found_channel;
+ call->conn = candidate;
+ call->security_ix = candidate->security_ix;
+ _leave(" = 0 [exclusive %d]", candidate->debug_id);
+ return 0;
}
- /* We need to redo the search before attempting to add a new connection
- * lest we race with someone else adding a conflicting instance.
+ /* Publish the new connection for userspace to find. We need to redo
+ * the search before doing this lest we race with someone else adding a
+ * conflicting instance.
*/
_debug("search 2");
spin_lock(&local->client_conns_lock);
@@ -249,124 +377,711 @@ int rxrpc_connect_call(struct rxrpc_call *call,
parent = *pp;
conn = rb_entry(parent, struct rxrpc_connection, client_node);
+#define cmp(X) ((long)conn->params.X - (long)candidate->params.X)
diff = (cmp(peer) ?:
cmp(key) ?:
cmp(security_level));
- if (diff < 0)
+#undef cmp
+ if (diff < 0) {
pp = &(*pp)->rb_left;
- else if (diff > 0)
+ } else if (diff > 0) {
pp = &(*pp)->rb_right;
- else
- goto found_extant_conn;
+ } else {
+ if (rxrpc_may_reuse_conn(conn) &&
+ rxrpc_get_connection_maybe(conn))
+ goto found_extant_conn;
+ /* The old connection is from an outdated epoch. */
+ _debug("replace conn");
+ clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
+ rb_replace_node(&conn->client_node,
+ &candidate->client_node,
+ &local->client_conns);
+ trace_rxrpc_client(conn, -1, rxrpc_client_replace);
+ goto candidate_published;
+ }
}
- /* The second search also failed; simply add the new connection with
- * the new call in channel 0. Note that we need to take the channel
- * lock before dropping the client conn lock.
- */
_debug("new conn");
- set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
rb_link_node(&candidate->client_node, parent, pp);
rb_insert_color(&candidate->client_node, &local->client_conns);
-attached:
- conn = candidate;
- candidate = NULL;
- atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
- spin_lock(&conn->channel_lock);
+candidate_published:
+ set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
+ call->conn = candidate;
+ call->security_ix = candidate->security_ix;
spin_unlock(&local->client_conns_lock);
- chan = 0;
+ _leave(" = 0 [new %d]", candidate->debug_id);
+ return 0;
-found_channel:
- _debug("found chan");
- call->conn = conn;
- call->channel = chan;
- call->epoch = conn->proto.epoch;
- call->cid = conn->proto.cid | chan;
- call->call_id = ++conn->channels[chan].call_counter;
- conn->channels[chan].call_id = call->call_id;
- rcu_assign_pointer(conn->channels[chan].call, call);
+ /* We come here if we found a suitable connection already in existence.
+ * Discard any candidate we may have allocated, and try to get a
+ * channel on this one.
+ */
+found_extant_conn:
+ _debug("found conn");
+ spin_unlock(&local->client_conns_lock);
- _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id);
+ if (candidate) {
+ trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
+ rxrpc_put_connection(candidate);
+ candidate = NULL;
+ }
+ spin_lock(&conn->channel_lock);
+ call->conn = conn;
+ call->security_ix = conn->security_ix;
+ list_add(&call->chan_wait_link, &conn->waiting_calls);
spin_unlock(&conn->channel_lock);
+ _leave(" = 0 [extant %d]", conn->debug_id);
+ return 0;
+
+error_peer:
rxrpc_put_peer(cp->peer);
cp->peer = NULL;
- _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
- return 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Activate a connection.
+ */
+static void rxrpc_activate_conn(struct rxrpc_connection *conn)
+{
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
+ conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
+ rxrpc_nr_active_client_conns++;
+ list_move_tail(&conn->cache_link, &rxrpc_active_client_conns);
+}
+
+/*
+ * Attempt to animate a connection for a new call.
+ *
+ * If it's not exclusive, the connection is in the endpoint tree, and we're in
+ * the conn's list of those waiting to grab a channel. There is, however, a
+ * limit on the number of live connections allowed at any one time, so we may
+ * have to wait for capacity to become available.
+ *
+ * Note that a connection on the waiting queue might *also* have active
+ * channels if it has been culled to make space and then re-requested by a new
+ * call.
+ */
+static void rxrpc_animate_client_conn(struct rxrpc_connection *conn)
+{
+ unsigned int nr_conns;
+
+ _enter("%d,%d", conn->debug_id, conn->cache_state);
+
+ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE)
+ goto out;
+
+ spin_lock(&rxrpc_client_conn_cache_lock);
+
+ nr_conns = rxrpc_nr_client_conns;
+ if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_count);
+ rxrpc_nr_client_conns = nr_conns + 1;
+ }
+
+ switch (conn->cache_state) {
+ case RXRPC_CONN_CLIENT_ACTIVE:
+ case RXRPC_CONN_CLIENT_WAITING:
+ break;
+
+ case RXRPC_CONN_CLIENT_INACTIVE:
+ case RXRPC_CONN_CLIENT_CULLED:
+ case RXRPC_CONN_CLIENT_IDLE:
+ if (nr_conns >= rxrpc_max_client_connections)
+ goto wait_for_capacity;
+ goto activate_conn;
+
+ default:
+ BUG();
+ }
+
+out_unlock:
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+out:
+ _leave(" [%d]", conn->cache_state);
+ return;
+
+activate_conn:
+ _debug("activate");
+ rxrpc_activate_conn(conn);
+ goto out_unlock;
+
+wait_for_capacity:
+ _debug("wait");
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
+ conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
+ list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns);
+ goto out_unlock;
+}
+
+/*
+ * Deactivate a channel.
+ */
+static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn,
+ unsigned int channel)
+{
+ struct rxrpc_channel *chan = &conn->channels[channel];
+
+ rcu_assign_pointer(chan->call, NULL);
+ conn->active_chans &= ~(1 << channel);
+}
+
+/*
+ * Assign a channel to the call at the front of the queue and wake the call up.
+ * We don't increment the callNumber counter until this number has been exposed
+ * to the world.
+ */
+static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
+ unsigned int channel)
+{
+ struct rxrpc_channel *chan = &conn->channels[channel];
+ struct rxrpc_call *call = list_entry(conn->waiting_calls.next,
+ struct rxrpc_call, chan_wait_link);
+ u32 call_id = chan->call_counter + 1;
+
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
+
+ write_lock_bh(&call->state_lock);
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+ write_unlock_bh(&call->state_lock);
+
+ rxrpc_see_call(call);
+ list_del_init(&call->chan_wait_link);
+ conn->active_chans |= 1 << channel;
+ call->peer = rxrpc_get_peer(conn->params.peer);
+ call->cid = conn->proto.cid | channel;
+ call->call_id = call_id;
+
+ _net("CONNECT call %08x:%08x as call %d on conn %d",
+ call->cid, call->call_id, call->debug_id, conn->debug_id);
- /* We found a potentially suitable connection already in existence. If
- * we can reuse it (ie. its usage count hasn't been reduced to 0 by the
- * reaper), discard any candidate we may have allocated, and try to get
- * a channel on this one, otherwise we have to replace it.
+ /* Paired with the read barrier in rxrpc_wait_for_channel(). This
+ * orders cid and epoch in the connection wrt to call_id without the
+ * need to take the channel_lock.
+ *
+ * We provisionally assign a callNumber at this point, but we don't
+ * confirm it until the call is about to be exposed.
+ *
+ * TODO: Pair with a barrier in the data_ready handler when that looks
+ * at the call ID through a connection channel.
*/
-found_extant_conn:
- _debug("found conn");
- if (!rxrpc_get_connection_maybe(conn)) {
- set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
- rb_replace_node(&conn->client_node,
- &candidate->client_node,
- &local->client_conns);
- clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
- goto attached;
+ smp_wmb();
+ chan->call_id = call_id;
+ rcu_assign_pointer(chan->call, call);
+ wake_up(&call->waitq);
+}
+
+/*
+ * Assign channels and callNumbers to waiting calls with channel_lock
+ * held by caller.
+ */
+static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn)
+{
+ u8 avail, mask;
+
+ switch (conn->cache_state) {
+ case RXRPC_CONN_CLIENT_ACTIVE:
+ mask = RXRPC_ACTIVE_CHANS_MASK;
+ break;
+ default:
+ return;
}
- spin_unlock(&local->client_conns_lock);
+ while (!list_empty(&conn->waiting_calls) &&
+ (avail = ~conn->active_chans,
+ avail &= mask,
+ avail != 0))
+ rxrpc_activate_one_channel(conn, __ffs(avail));
+}
+
+/*
+ * Assign channels and callNumbers to waiting calls.
+ */
+static void rxrpc_activate_channels(struct rxrpc_connection *conn)
+{
+ _enter("%d", conn->debug_id);
- rxrpc_put_connection(candidate);
+ trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
+
+ if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
+ return;
+
+ spin_lock(&conn->channel_lock);
+ rxrpc_activate_channels_locked(conn);
+ spin_unlock(&conn->channel_lock);
+ _leave("");
+}
+
+/*
+ * Wait for a callNumber and a channel to be granted to a call.
+ */
+static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
+{
+ int ret = 0;
+
+ _enter("%d", call->debug_id);
+
+ if (!call->call_id) {
+ DECLARE_WAITQUEUE(myself, current);
- if (!atomic_add_unless(&conn->avail_chans, -1, 0)) {
if (!gfpflags_allow_blocking(gfp)) {
- rxrpc_put_connection(conn);
- _leave(" = -EAGAIN");
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- add_wait_queue(&conn->channel_wq, &myself);
+ add_wait_queue_exclusive(&call->waitq, &myself);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (atomic_add_unless(&conn->avail_chans, -1, 0))
+ if (call->call_id)
+ break;
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
break;
- if (signal_pending(current))
- goto interrupted;
+ }
schedule();
}
- remove_wait_queue(&conn->channel_wq, &myself);
+ remove_wait_queue(&call->waitq, &myself);
__set_current_state(TASK_RUNNING);
}
- /* The connection allegedly now has a free channel and we can now
- * attach the call to it.
- */
+ /* Paired with the write barrier in rxrpc_activate_one_channel(). */
+ smp_rmb();
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * find a connection for a call
+ * - called in process context with IRQs enabled
+ */
+int rxrpc_connect_call(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ int ret;
+
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+
+ rxrpc_discard_expired_client_conns(NULL);
+ rxrpc_cull_active_client_conns();
+
+ ret = rxrpc_get_client_conn(call, cp, srx, gfp);
+ if (ret < 0)
+ return ret;
+
+ rxrpc_animate_client_conn(call->conn);
+ rxrpc_activate_channels(call->conn);
+
+ ret = rxrpc_wait_for_channel(call, gfp);
+ if (ret < 0)
+ rxrpc_disconnect_client_call(call);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Note that a connection is about to be exposed to the world. Once it is
+ * exposed, we maintain an extra ref on it that stops it from being summarily
+ * discarded before it's (a) had a chance to deal with retransmission and (b)
+ * had a chance at re-use (the per-connection security negotiation is
+ * expensive).
+ */
+static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
+ unsigned int channel)
+{
+ if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
+ rxrpc_get_connection(conn);
+ }
+}
+
+/*
+ * Note that a call, and thus a connection, is about to be exposed to the
+ * world.
+ */
+void rxrpc_expose_client_call(struct rxrpc_call *call)
+{
+ unsigned int channel = call->cid & RXRPC_CHANNELMASK;
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_channel *chan = &conn->channels[channel];
+
+ if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+ /* Mark the call ID as being used. If the callNumber counter
+ * exceeds ~2 billion, we kill the connection after its
+ * outstanding calls have finished so that the counter doesn't
+ * wrap.
+ */
+ chan->call_counter++;
+ if (chan->call_counter >= INT_MAX)
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+ rxrpc_expose_client_conn(conn, channel);
+ }
+}
+
+/*
+ * Disconnect a client call.
+ */
+void rxrpc_disconnect_client_call(struct rxrpc_call *call)
+{
+ unsigned int channel = call->cid & RXRPC_CHANNELMASK;
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_channel *chan = &conn->channels[channel];
+
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
+ call->conn = NULL;
+
spin_lock(&conn->channel_lock);
- for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
- if (!conn->channels[chan].call)
- goto found_channel;
- BUG();
+ /* Calls that have never actually been assigned a channel can simply be
+ * discarded. If the conn didn't get used either, it will follow
+ * immediately unless someone else grabs it in the meantime.
+ */
+ if (!list_empty(&call->chan_wait_link)) {
+ _debug("call is waiting");
+ ASSERTCMP(call->call_id, ==, 0);
+ ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
+ list_del_init(&call->chan_wait_link);
+
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
+
+ /* We must deactivate or idle the connection if it's now
+ * waiting for nothing.
+ */
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING &&
+ list_empty(&conn->waiting_calls) &&
+ !conn->active_chans)
+ goto idle_connection;
+ goto out;
+ }
+
+ ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
+
+ /* If a client call was exposed to the world, we save the result for
+ * retransmission.
+ *
+ * We use a barrier here so that the call number and abort code can be
+ * read without needing to take a lock.
+ *
+ * TODO: Make the incoming packet handler check this and handle
+ * terminal retransmission without requiring access to the call.
+ */
+ if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+ _debug("exposed %u,%u", call->call_id, call->abort_code);
+ __rxrpc_disconnect_call(conn, call);
+ }
+
+ /* See if we can pass the channel directly to another call. */
+ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
+ !list_empty(&conn->waiting_calls)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
+ rxrpc_activate_one_channel(conn, channel);
+ goto out_2;
+ }
+
+ /* Things are more complex and we need the cache lock. We might be
+ * able to simply idle the conn or it might now be lurking on the wait
+ * list. It might even get moved back to the active list whilst we're
+ * waiting for the lock.
+ */
+ spin_lock(&rxrpc_client_conn_cache_lock);
+
+ switch (conn->cache_state) {
+ case RXRPC_CONN_CLIENT_ACTIVE:
+ if (list_empty(&conn->waiting_calls)) {
+ rxrpc_deactivate_one_channel(conn, channel);
+ if (!conn->active_chans) {
+ rxrpc_nr_active_client_conns--;
+ goto idle_connection;
+ }
+ goto out;
+ }
+
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
+ rxrpc_activate_one_channel(conn, channel);
+ goto out;
-interrupted:
- remove_wait_queue(&conn->channel_wq, &myself);
- __set_current_state(TASK_RUNNING);
+ case RXRPC_CONN_CLIENT_CULLED:
+ rxrpc_deactivate_one_channel(conn, channel);
+ ASSERT(list_empty(&conn->waiting_calls));
+ if (!conn->active_chans)
+ goto idle_connection;
+ goto out;
+
+ case RXRPC_CONN_CLIENT_WAITING:
+ rxrpc_deactivate_one_channel(conn, channel);
+ goto out;
+
+ default:
+ BUG();
+ }
+
+out:
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+out_2:
+ spin_unlock(&conn->channel_lock);
rxrpc_put_connection(conn);
- rxrpc_put_peer(cp->peer);
- cp->peer = NULL;
- _leave(" = -ERESTARTSYS");
- return -ERESTARTSYS;
+ _leave("");
+ return;
+
+idle_connection:
+ /* As no channels remain active, the connection gets deactivated
+ * immediately or moved to the idle list for a short while.
+ */
+ if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
+ conn->idle_timestamp = jiffies;
+ conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
+ list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns);
+ if (rxrpc_idle_client_conns.next == &conn->cache_link &&
+ !rxrpc_kill_all_client_conns)
+ queue_delayed_work(rxrpc_workqueue,
+ &rxrpc_client_conn_reap,
+ rxrpc_conn_idle_client_expiry);
+ } else {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
+ conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
+ list_del_init(&conn->cache_link);
+ }
+ goto out;
}
/*
- * Remove a client connection from the local endpoint's tree, thereby removing
- * it as a target for reuse for new client calls.
+ * Clean up a dead client connection.
*/
-void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn)
+static struct rxrpc_connection *
+rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
{
+ struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
+ unsigned int nr_conns;
- spin_lock(&local->client_conns_lock);
- if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags))
- rb_erase(&conn->client_node, &local->client_conns);
- spin_unlock(&local->client_conns_lock);
+ trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+
+ if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
+ spin_lock(&local->client_conns_lock);
+ if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
+ &conn->flags))
+ rb_erase(&conn->client_node, &local->client_conns);
+ spin_unlock(&local->client_conns_lock);
+ }
rxrpc_put_client_connection_id(conn);
+
+ ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
+
+ if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ nr_conns = --rxrpc_nr_client_conns;
+
+ if (nr_conns < rxrpc_max_client_connections &&
+ !list_empty(&rxrpc_waiting_client_conns)) {
+ next = list_entry(rxrpc_waiting_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ rxrpc_get_connection(next);
+ rxrpc_activate_conn(next);
+ }
+
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+ }
+
+ rxrpc_kill_connection(conn);
+ if (next)
+ rxrpc_activate_channels(next);
+
+ /* We need to get rid of the temporary ref we took upon next, but we
+ * can't call rxrpc_put_connection() recursively.
+ */
+ return next;
+}
+
+/*
+ * Clean up a dead client connections.
+ */
+void rxrpc_put_client_conn(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n;
+
+ do {
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
+ if (n > 0)
+ return;
+ ASSERTCMP(n, >=, 0);
+
+ conn = rxrpc_put_one_client_conn(conn);
+ } while (conn);
+}
+
+/*
+ * Kill the longest-active client connections to make room for new ones.
+ */
+static void rxrpc_cull_active_client_conns(void)
+{
+ struct rxrpc_connection *conn;
+ unsigned int nr_conns = rxrpc_nr_client_conns;
+ unsigned int nr_active, limit;
+
+ _enter("");
+
+ ASSERTCMP(nr_conns, >=, 0);
+ if (nr_conns < rxrpc_max_client_connections) {
+ _leave(" [ok]");
+ return;
+ }
+ limit = rxrpc_reap_client_connections;
+
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ nr_active = rxrpc_nr_active_client_conns;
+
+ while (nr_active > limit) {
+ ASSERT(!list_empty(&rxrpc_active_client_conns));
+ conn = list_entry(rxrpc_active_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE);
+
+ if (list_empty(&conn->waiting_calls)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
+ conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
+ list_del_init(&conn->cache_link);
+ } else {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
+ conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
+ list_move_tail(&conn->cache_link,
+ &rxrpc_waiting_client_conns);
+ }
+
+ nr_active--;
+ }
+
+ rxrpc_nr_active_client_conns = nr_active;
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+ ASSERTCMP(nr_active, >=, 0);
+ _leave(" [culled]");
+}
+
+/*
+ * Discard expired client connections from the idle list. Each conn in the
+ * idle list has been exposed and holds an extra ref because of that.
+ *
+ * This may be called from conn setup or from a work item so cannot be
+ * considered non-reentrant.
+ */
+static void rxrpc_discard_expired_client_conns(struct work_struct *work)
+{
+ struct rxrpc_connection *conn;
+ unsigned long expiry, conn_expires_at, now;
+ unsigned int nr_conns;
+ bool did_discard = false;
+
+ _enter("%c", work ? 'w' : 'n');
+
+ if (list_empty(&rxrpc_idle_client_conns)) {
+ _leave(" [empty]");
+ return;
+ }
+
+ /* Don't double up on the discarding */
+ if (!spin_trylock(&rxrpc_client_conn_discard_mutex)) {
+ _leave(" [already]");
+ return;
+ }
+
+ /* We keep an estimate of what the number of conns ought to be after
+ * we've discarded some so that we don't overdo the discarding.
+ */
+ nr_conns = rxrpc_nr_client_conns;
+
+next:
+ spin_lock(&rxrpc_client_conn_cache_lock);
+
+ if (list_empty(&rxrpc_idle_client_conns))
+ goto out;
+
+ conn = list_entry(rxrpc_idle_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags));
+
+ if (!rxrpc_kill_all_client_conns) {
+ /* If the number of connections is over the reap limit, we
+ * expedite discard by reducing the expiry timeout. We must,
+ * however, have at least a short grace period to be able to do
+ * final-ACK or ABORT retransmission.
+ */
+ expiry = rxrpc_conn_idle_client_expiry;
+ if (nr_conns > rxrpc_reap_client_connections)
+ expiry = rxrpc_conn_idle_client_fast_expiry;
+
+ conn_expires_at = conn->idle_timestamp + expiry;
+
+ now = READ_ONCE(jiffies);
+ if (time_after(conn_expires_at, now))
+ goto not_yet_expired;
+ }
+
+ trace_rxrpc_client(conn, -1, rxrpc_client_discard);
+ if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
+ BUG();
+ conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
+ list_del_init(&conn->cache_link);
+
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+
+ /* When we cleared the EXPOSED flag, we took on responsibility for the
+ * reference that that had on the usage count. We deal with that here.
+ * If someone re-sets the flag and re-gets the ref, that's fine.
+ */
+ rxrpc_put_connection(conn);
+ did_discard = true;
+ nr_conns--;
+ goto next;
+
+not_yet_expired:
+ /* The connection at the front of the queue hasn't yet expired, so
+ * schedule the work item for that point if we discarded something.
+ *
+ * We don't worry if the work item is already scheduled - it can look
+ * after rescheduling itself at a later time. We could cancel it, but
+ * then things get messier.
+ */
+ _debug("not yet");
+ if (!rxrpc_kill_all_client_conns)
+ queue_delayed_work(rxrpc_workqueue,
+ &rxrpc_client_conn_reap,
+ conn_expires_at - now);
+
+out:
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+ spin_unlock(&rxrpc_client_conn_discard_mutex);
+ _leave("");
+}
+
+/*
+ * Preemptively destroy all the client connection records rather than waiting
+ * for them to time out
+ */
+void __exit rxrpc_destroy_all_client_connections(void)
+{
+ _enter("");
+
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ rxrpc_kill_all_client_conns = true;
+ spin_unlock(&rxrpc_client_conn_cache_lock);
+
+ cancel_delayed_work(&rxrpc_client_conn_reap);
+
+ if (!queue_delayed_work(rxrpc_workqueue, &rxrpc_client_conn_reap, 0))
+ _debug("destroy: queue failed");
+
+ _leave("");
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index cee0f35bc1cf..3f9d8d7ec632 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -15,20 +15,128 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/errqueue.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
#include "ar-internal.h"
/*
+ * Retransmit terminal ACK or ABORT of the previous call.
+ */
+static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_channel *chan;
+ struct msghdr msg;
+ struct kvec iov;
+ struct {
+ struct rxrpc_wire_header whdr;
+ union {
+ struct {
+ __be32 code;
+ } abort;
+ struct {
+ struct rxrpc_ackpacket ack;
+ u8 padding[3];
+ struct rxrpc_ackinfo info;
+ };
+ };
+ } __attribute__((packed)) pkt;
+ size_t len;
+ u32 serial, mtu, call_id;
+
+ _enter("%d", conn->debug_id);
+
+ chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK];
+
+ /* If the last call got moved on whilst we were waiting to run, just
+ * ignore this packet.
+ */
+ call_id = READ_ONCE(chan->last_call);
+ /* Sync with __rxrpc_disconnect_call() */
+ smp_rmb();
+ if (call_id != sp->hdr.callNumber)
+ return;
+
+ msg.msg_name = &conn->params.peer->srx.transport;
+ msg.msg_namelen = conn->params.peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ pkt.whdr.epoch = htonl(sp->hdr.epoch);
+ pkt.whdr.cid = htonl(sp->hdr.cid);
+ pkt.whdr.callNumber = htonl(sp->hdr.callNumber);
+ pkt.whdr.seq = 0;
+ pkt.whdr.type = chan->last_type;
+ pkt.whdr.flags = conn->out_clientflag;
+ pkt.whdr.userStatus = 0;
+ pkt.whdr.securityIndex = conn->security_ix;
+ pkt.whdr._rsvd = 0;
+ pkt.whdr.serviceId = htons(chan->last_service_id);
+
+ len = sizeof(pkt.whdr);
+ switch (chan->last_type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ pkt.abort.code = htonl(chan->last_abort);
+ len += sizeof(pkt.abort);
+ break;
+
+ case RXRPC_PACKET_TYPE_ACK:
+ mtu = conn->params.peer->if_mtu;
+ mtu -= conn->params.peer->hdrsize;
+ pkt.ack.bufferSpace = 0;
+ pkt.ack.maxSkew = htons(skb->priority);
+ pkt.ack.firstPacket = htonl(chan->last_seq);
+ pkt.ack.previousPacket = htonl(chan->last_seq - 1);
+ pkt.ack.serial = htonl(sp->hdr.serial);
+ pkt.ack.reason = RXRPC_ACK_DUPLICATE;
+ pkt.ack.nAcks = 0;
+ pkt.info.rxMTU = htonl(rxrpc_rx_mtu);
+ pkt.info.maxMTU = htonl(mtu);
+ pkt.info.rwind = htonl(rxrpc_rx_window_size);
+ pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ pkt.whdr.flags |= RXRPC_SLOW_START_OK;
+ len += sizeof(pkt.ack) + sizeof(pkt.info);
+ break;
+ }
+
+ /* Resync with __rxrpc_disconnect_call() and check that the last call
+ * didn't get advanced whilst we were filling out the packets.
+ */
+ smp_rmb();
+ if (READ_ONCE(chan->last_call) != call_id)
+ return;
+
+ iov.iov_base = &pkt;
+ iov.iov_len = len;
+
+ serial = atomic_inc_return(&conn->serial);
+ pkt.whdr.serial = htonl(serial);
+
+ switch (chan->last_type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
+ break;
+ case RXRPC_PACKET_TYPE_ACK:
+ trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
+ RXRPC_ACK_DUPLICATE, 0);
+ _proto("Tx ACK %%%u [re]", serial);
+ break;
+ }
+
+ kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len);
+ _leave("");
+ return;
+}
+
+/*
* pass a connection-level abort onto all calls on that connection
*/
-static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
- u32 abort_code)
+static void rxrpc_abort_calls(struct rxrpc_connection *conn,
+ enum rxrpc_call_completion compl,
+ u32 abort_code, int error)
{
struct rxrpc_call *call;
int i;
@@ -41,19 +149,15 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
call = rcu_dereference_protected(
conn->channels[i].call,
lockdep_is_held(&conn->channel_lock));
- write_lock_bh(&call->state_lock);
- if (call->state <= RXRPC_CALL_COMPLETE) {
- call->state = state;
- if (state == RXRPC_CALL_LOCALLY_ABORTED) {
- call->local_abort = conn->local_abort;
- set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- } else {
- call->remote_abort = conn->remote_abort;
- set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- }
- rxrpc_queue_call(call);
+ if (call) {
+ if (compl == RXRPC_CALL_LOCALLY_ABORTED)
+ trace_rxrpc_abort("CON", call->cid,
+ call->call_id, 0,
+ abort_code, error);
+ if (rxrpc_set_call_completion(call, compl,
+ abort_code, error))
+ rxrpc_notify_socket(call);
}
- write_unlock_bh(&call->state_lock);
}
spin_unlock(&conn->channel_lock);
@@ -78,17 +182,16 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
/* generate a connection-level abort */
spin_lock_bh(&conn->state_lock);
- if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) {
- conn->state = RXRPC_CONN_LOCALLY_ABORTED;
- conn->error = error;
- spin_unlock_bh(&conn->state_lock);
- } else {
+ if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
spin_unlock_bh(&conn->state_lock);
_leave(" = 0 [already dead]");
return 0;
}
- rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
+ conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+ spin_unlock_bh(&conn->state_lock);
+
+ rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error);
msg.msg_name = &conn->params.peer->srx.transport;
msg.msg_namelen = conn->params.peer->srx.transport_len;
@@ -132,17 +235,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
/*
* mark a call as being on a now-secured channel
- * - must be called with softirqs disabled
+ * - must be called with BH's disabled.
*/
static void rxrpc_call_is_secure(struct rxrpc_call *call)
{
_enter("%p", call);
if (call) {
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
+ write_lock_bh(&call->state_lock);
+ if (call->state == RXRPC_CALL_SERVER_SECURING) {
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ rxrpc_notify_socket(call);
+ }
+ write_unlock_bh(&call->state_lock);
}
}
@@ -159,22 +263,28 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
int loop, ret;
if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
- kleave(" = -ECONNABORTED [%u]", conn->state);
+ _leave(" = -ECONNABORTED [%u]", conn->state);
return -ECONNABORTED;
}
_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_DATA:
+ case RXRPC_PACKET_TYPE_ACK:
+ rxrpc_conn_retransmit_call(conn, skb);
+ return 0;
+
case RXRPC_PACKET_TYPE_ABORT:
- if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &wtmp, sizeof(wtmp)) < 0)
return -EPROTO;
abort_code = ntohl(wtmp);
_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
- abort_code);
+ abort_code, ECONNABORTED);
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -199,14 +309,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
conn->state = RXRPC_CONN_SERVICE;
+ spin_unlock(&conn->state_lock);
for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
rxrpc_call_is_secure(
rcu_dereference_protected(
conn->channels[loop].call,
lockdep_is_held(&conn->channel_lock)));
+ } else {
+ spin_unlock(&conn->state_lock);
}
- spin_unlock(&conn->state_lock);
spin_unlock(&conn->channel_lock);
return 0;
@@ -269,7 +381,7 @@ void rxrpc_process_connection(struct work_struct *work)
u32 abort_code = RX_PROTOCOL_ERROR;
int ret;
- _enter("{%d}", conn->debug_id);
+ rxrpc_see_connection(conn);
if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
rxrpc_secure_connection(conn);
@@ -277,6 +389,7 @@ void rxrpc_process_connection(struct work_struct *work)
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we've finished with it */
while ((skb = skb_dequeue(&conn->rx_queue))) {
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
ret = rxrpc_process_event(conn, skb, &abort_code);
switch (ret) {
case -EPROTO:
@@ -287,7 +400,7 @@ void rxrpc_process_connection(struct work_struct *work)
goto requeue_and_leave;
case -ECONNABORTED:
default:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
break;
}
}
@@ -304,91 +417,7 @@ requeue_and_leave:
protocol_error:
if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
goto requeue_and_leave;
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [EPROTO]");
goto out;
}
-
-/*
- * put a packet up for transport-level abort
- */
-void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
-{
- CHECK_SLAB_OKAY(&local->usage);
-
- skb_queue_tail(&local->reject_queue, skb);
- rxrpc_queue_local(local);
-}
-
-/*
- * reject packets through the local endpoint
- */
-void rxrpc_reject_packets(struct rxrpc_local *local)
-{
- union {
- struct sockaddr sa;
- struct sockaddr_in sin;
- } sa;
- struct rxrpc_skb_priv *sp;
- struct rxrpc_wire_header whdr;
- struct sk_buff *skb;
- struct msghdr msg;
- struct kvec iov[2];
- size_t size;
- __be32 code;
-
- _enter("%d", local->debug_id);
-
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = &code;
- iov[1].iov_len = sizeof(code);
- size = sizeof(whdr) + sizeof(code);
-
- msg.msg_name = &sa;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- memset(&sa, 0, sizeof(sa));
- sa.sa.sa_family = local->srx.transport.family;
- switch (sa.sa.sa_family) {
- case AF_INET:
- msg.msg_namelen = sizeof(sa.sin);
- break;
- default:
- msg.msg_namelen = 0;
- break;
- }
-
- memset(&whdr, 0, sizeof(whdr));
- whdr.type = RXRPC_PACKET_TYPE_ABORT;
-
- while ((skb = skb_dequeue(&local->reject_queue))) {
- sp = rxrpc_skb(skb);
- switch (sa.sa.sa_family) {
- case AF_INET:
- sa.sin.sin_port = udp_hdr(skb)->source;
- sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
- code = htonl(skb->priority);
-
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.serviceId = htons(sp->hdr.serviceId);
- whdr.flags = sp->hdr.flags;
- whdr.flags ^= RXRPC_CLIENT_INITIATED;
- whdr.flags &= RXRPC_CLIENT_INITIATED;
-
- kernel_sendmsg(local->socket, &msg, iov, 2, size);
- break;
-
- default:
- break;
- }
-
- rxrpc_free_skb(skb);
- }
-
- _leave("");
-}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 896d84493a05..e1e83af47866 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -1,6 +1,6 @@
-/* RxRPC virtual connection handler
+/* RxRPC virtual connection handler, common bits.
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -15,8 +15,6 @@
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
#include "ar-internal.h"
/*
@@ -27,9 +25,12 @@ unsigned int rxrpc_connection_expiry = 10 * 60;
static void rxrpc_connection_reaper(struct work_struct *work);
LIST_HEAD(rxrpc_connections);
+LIST_HEAD(rxrpc_connection_proc_list);
DEFINE_RWLOCK(rxrpc_connection_lock);
static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+static void rxrpc_destroy_connection(struct rcu_head *);
+
/*
* allocate a new connection
*/
@@ -41,21 +42,18 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
if (conn) {
+ INIT_LIST_HEAD(&conn->cache_link);
spin_lock_init(&conn->channel_lock);
- init_waitqueue_head(&conn->channel_wq);
+ INIT_LIST_HEAD(&conn->waiting_calls);
INIT_WORK(&conn->processor, &rxrpc_process_connection);
+ INIT_LIST_HEAD(&conn->proc_link);
INIT_LIST_HEAD(&conn->link);
skb_queue_head_init(&conn->rx_queue);
conn->security = &rxrpc_no_security;
spin_lock_init(&conn->state_lock);
- /* We maintain an extra ref on the connection whilst it is
- * on the rxrpc_connections list.
- */
- atomic_set(&conn->usage, 2);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
- atomic_set(&conn->avail_chans, RXRPC_MAXCALLS);
conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_wire_header);
+ conn->idle_timestamp = jiffies;
}
_leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
@@ -135,6 +133,16 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
srx.transport.sin.sin_addr.s_addr)
goto not_found;
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ if (peer->srx.transport.sin6.sin6_port !=
+ srx.transport.sin6.sin6_port ||
+ memcmp(&peer->srx.transport.sin6.sin6_addr,
+ &srx.transport.sin6.sin6_addr,
+ sizeof(struct in6_addr)) != 0)
+ goto not_found;
+ break;
+#endif
default:
BUG();
}
@@ -153,25 +161,32 @@ not_found:
* terminates. The caller must hold the channel_lock and must release the
* call's ref on the connection.
*/
-void __rxrpc_disconnect_call(struct rxrpc_call *call)
+void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
+ struct rxrpc_call *call)
{
- struct rxrpc_connection *conn = call->conn;
- struct rxrpc_channel *chan = &conn->channels[call->channel];
+ struct rxrpc_channel *chan =
+ &conn->channels[call->cid & RXRPC_CHANNELMASK];
- _enter("%d,%d", conn->debug_id, call->channel);
+ _enter("%d,%x", conn->debug_id, call->cid);
if (rcu_access_pointer(chan->call) == call) {
/* Save the result of the call so that we can repeat it if necessary
* through the channel, whilst disposing of the actual call record.
*/
- chan->last_result = call->local_abort;
+ chan->last_service_id = call->service_id;
+ if (call->abort_code) {
+ chan->last_abort = call->abort_code;
+ chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+ } else {
+ chan->last_seq = call->rx_hard_ack;
+ chan->last_type = RXRPC_PACKET_TYPE_ACK;
+ }
+ /* Sync with rxrpc_conn_retransmit(). */
smp_wmb();
chan->last_call = chan->call_id;
chan->call_id = chan->call_counter;
rcu_assign_pointer(chan->call, NULL);
- atomic_inc(&conn->avail_chans);
- wake_up(&conn->channel_wq);
}
_leave("");
@@ -185,34 +200,122 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
{
struct rxrpc_connection *conn = call->conn;
+ spin_lock_bh(&conn->params.peer->lock);
+ hlist_del_init(&call->error_link);
+ spin_unlock_bh(&conn->params.peer->lock);
+
+ if (rxrpc_is_client_call(call))
+ return rxrpc_disconnect_client_call(call);
+
spin_lock(&conn->channel_lock);
- __rxrpc_disconnect_call(call);
+ __rxrpc_disconnect_call(conn, call);
spin_unlock(&conn->channel_lock);
call->conn = NULL;
+ conn->idle_timestamp = jiffies;
rxrpc_put_connection(conn);
}
/*
- * release a virtual connection
+ * Kill off a connection.
*/
-void rxrpc_put_connection(struct rxrpc_connection *conn)
+void rxrpc_kill_connection(struct rxrpc_connection *conn)
{
- if (!conn)
- return;
+ ASSERT(!rcu_access_pointer(conn->channels[0].call) &&
+ !rcu_access_pointer(conn->channels[1].call) &&
+ !rcu_access_pointer(conn->channels[2].call) &&
+ !rcu_access_pointer(conn->channels[3].call));
+ ASSERT(list_empty(&conn->cache_link));
- _enter("%p{u=%d,d=%d}",
- conn, atomic_read(&conn->usage), conn->debug_id);
+ write_lock(&rxrpc_connection_lock);
+ list_del_init(&conn->proc_link);
+ write_unlock(&rxrpc_connection_lock);
- ASSERTCMP(atomic_read(&conn->usage), >, 1);
+ /* Drain the Rx queue. Note that even though we've unpublished, an
+ * incoming packet could still be being added to our Rx queue, so we
+ * will need to drain it again in the RCU cleanup handler.
+ */
+ rxrpc_purge_queue(&conn->rx_queue);
- conn->put_time = ktime_get_seconds();
- if (atomic_dec_return(&conn->usage) == 1) {
- _debug("zombie");
- rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ /* Leave final destruction to RCU. The connection processor work item
+ * must carry a ref on the connection to prevent us getting here whilst
+ * it is queued or running.
+ */
+ call_rcu(&conn->rcu, rxrpc_destroy_connection);
+}
+
+/*
+ * Queue a connection's work processor, getting a ref to pass to the work
+ * queue.
+ */
+bool rxrpc_queue_conn(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n == 0)
+ return false;
+ if (rxrpc_queue_work(&conn->processor))
+ trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here);
+ else
+ rxrpc_put_connection(conn);
+ return true;
+}
+
+/*
+ * Note the re-emergence of a connection.
+ */
+void rxrpc_see_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ if (conn) {
+ int n = atomic_read(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here);
+ }
+}
+
+/*
+ * Get a ref on a connection.
+ */
+void rxrpc_get_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n, here);
+}
+
+/*
+ * Try to get a ref on a connection.
+ */
+struct rxrpc_connection *
+rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+
+ if (conn) {
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n > 0)
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
+ else
+ conn = NULL;
}
+ return conn;
+}
- _leave("");
+/*
+ * Release a service connection
+ */
+void rxrpc_put_service_conn(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n;
+
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
+ ASSERTCMP(n, >=, 0);
+ if (n == 0)
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
}
/*
@@ -242,19 +345,19 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
}
/*
- * reap dead connections
+ * reap dead service connections
*/
static void rxrpc_connection_reaper(struct work_struct *work)
{
struct rxrpc_connection *conn, *_p;
- unsigned long reap_older_than, earliest, put_time, now;
+ unsigned long reap_older_than, earliest, idle_timestamp, now;
LIST_HEAD(graveyard);
_enter("");
- now = ktime_get_seconds();
- reap_older_than = now - rxrpc_connection_expiry;
+ now = jiffies;
+ reap_older_than = now - rxrpc_connection_expiry * HZ;
earliest = ULONG_MAX;
write_lock(&rxrpc_connection_lock);
@@ -262,11 +365,17 @@ static void rxrpc_connection_reaper(struct work_struct *work)
ASSERTCMP(atomic_read(&conn->usage), >, 0);
if (likely(atomic_read(&conn->usage) > 1))
continue;
+ if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
+ continue;
+
+ idle_timestamp = READ_ONCE(conn->idle_timestamp);
+ _debug("reap CONN %d { u=%d,t=%ld }",
+ conn->debug_id, atomic_read(&conn->usage),
+ (long)reap_older_than - (long)idle_timestamp);
- put_time = READ_ONCE(conn->put_time);
- if (time_after(put_time, reap_older_than)) {
- if (time_before(put_time, earliest))
- earliest = put_time;
+ if (time_after(idle_timestamp, reap_older_than)) {
+ if (time_before(idle_timestamp, earliest))
+ earliest = idle_timestamp;
continue;
}
@@ -277,7 +386,7 @@ static void rxrpc_connection_reaper(struct work_struct *work)
continue;
if (rxrpc_conn_is_client(conn))
- rxrpc_unpublish_client_conn(conn);
+ BUG();
else
rxrpc_unpublish_service_conn(conn);
@@ -287,9 +396,9 @@ static void rxrpc_connection_reaper(struct work_struct *work)
if (earliest != ULONG_MAX) {
_debug("reschedule reaper %ld", (long) earliest - now);
- ASSERTCMP(earliest, >, now);
+ ASSERT(time_after(earliest, now));
rxrpc_queue_delayed_work(&rxrpc_connection_reap,
- (earliest - now) * HZ);
+ earliest - now);
}
while (!list_empty(&graveyard)) {
@@ -298,16 +407,15 @@ static void rxrpc_connection_reaper(struct work_struct *work)
list_del_init(&conn->link);
ASSERTCMP(atomic_read(&conn->usage), ==, 0);
- skb_queue_purge(&conn->rx_queue);
- call_rcu(&conn->rcu, rxrpc_destroy_connection);
+ rxrpc_kill_connection(conn);
}
_leave("");
}
/*
- * preemptively destroy all the connection records rather than waiting for them
- * to time out
+ * preemptively destroy all the service connection records rather than
+ * waiting for them to time out
*/
void __exit rxrpc_destroy_all_connections(void)
{
@@ -316,6 +424,8 @@ void __exit rxrpc_destroy_all_connections(void)
_enter("");
+ rxrpc_destroy_all_client_connections();
+
rxrpc_connection_expiry = 0;
cancel_delayed_work(&rxrpc_connection_reap);
rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
@@ -330,6 +440,8 @@ void __exit rxrpc_destroy_all_connections(void)
write_unlock(&rxrpc_connection_lock);
BUG_ON(leak);
+ ASSERT(list_empty(&rxrpc_connection_proc_list));
+
/* Make sure the local and peer records pinned by any dying connections
* are released.
*/
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index fd9027ccba8f..eef551f40dc2 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -65,9 +65,8 @@ done:
* Insert a service connection into a peer's tree, thereby making it a target
* for incoming packets.
*/
-static struct rxrpc_connection *
-rxrpc_publish_service_conn(struct rxrpc_peer *peer,
- struct rxrpc_connection *conn)
+static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
+ struct rxrpc_connection *conn)
{
struct rxrpc_connection *cursor = NULL;
struct rxrpc_conn_proto k = conn->proto;
@@ -96,7 +95,7 @@ conn_published:
set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
write_sequnlock_bh(&peer->service_conn_lock);
_leave(" = %d [new]", conn->debug_id);
- return conn;
+ return;
found_extant_conn:
if (atomic_read(&cursor->usage) == 0)
@@ -119,100 +118,58 @@ replace_old_connection:
}
/*
- * get a record of an incoming connection
+ * Preallocate a service connection. The connection is placed on the proc and
+ * reap lists so that we don't have to get the lock from BH context.
*/
-struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local,
- struct sockaddr_rxrpc *srx,
- struct sk_buff *skb)
+struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp)
{
- struct rxrpc_connection *conn;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_peer *peer;
- const char *new = "old";
-
- _enter("");
+ struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp);
- peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
- if (!peer) {
- _debug("no peer");
- return ERR_PTR(-EBUSY);
- }
+ if (conn) {
+ /* We maintain an extra ref on the connection whilst it is on
+ * the rxrpc_connections list.
+ */
+ conn->state = RXRPC_CONN_SERVICE_PREALLOC;
+ atomic_set(&conn->usage, 2);
- ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED);
-
- rcu_read_lock();
- peer = rxrpc_lookup_peer_rcu(local, srx);
- if (peer) {
- conn = rxrpc_find_service_conn_rcu(peer, skb);
- if (conn) {
- if (sp->hdr.securityIndex != conn->security_ix)
- goto security_mismatch_rcu;
- if (rxrpc_get_connection_maybe(conn))
- goto found_extant_connection_rcu;
-
- /* The conn has expired but we can't remove it without
- * the appropriate lock, so we attempt to replace it
- * when we have a new candidate.
- */
- }
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
+ write_unlock(&rxrpc_connection_lock);
- if (!rxrpc_get_peer_maybe(peer))
- peer = NULL;
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage),
+ __builtin_return_address(0));
}
- rcu_read_unlock();
- if (!peer) {
- peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
- if (!peer)
- goto enomem;
- }
+ return conn;
+}
- /* We don't have a matching record yet. */
- conn = rxrpc_alloc_connection(GFP_NOIO);
- if (!conn)
- goto enomem_peer;
+/*
+ * Set up an incoming connection. This is called in BH context with the RCU
+ * read lock held.
+ */
+void rxrpc_new_incoming_connection(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _enter("");
conn->proto.epoch = sp->hdr.epoch;
conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK;
- conn->params.local = local;
- conn->params.peer = peer;
conn->params.service_id = sp->hdr.serviceId;
conn->security_ix = sp->hdr.securityIndex;
conn->out_clientflag = 0;
- conn->state = RXRPC_CONN_SERVICE;
- if (conn->params.service_id)
+ if (conn->security_ix)
conn->state = RXRPC_CONN_SERVICE_UNSECURED;
-
- rxrpc_get_local(local);
-
- write_lock(&rxrpc_connection_lock);
- list_add_tail(&conn->link, &rxrpc_connections);
- write_unlock(&rxrpc_connection_lock);
+ else
+ conn->state = RXRPC_CONN_SERVICE;
/* Make the connection a target for incoming packets. */
- rxrpc_publish_service_conn(peer, conn);
-
- new = "new";
-
-success:
- _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid);
- _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
- return conn;
-
-found_extant_connection_rcu:
- rcu_read_unlock();
- goto success;
-
-security_mismatch_rcu:
- rcu_read_unlock();
- _leave(" = -EKEYREJECTED");
- return ERR_PTR(-EKEYREJECTED);
+ rxrpc_publish_service_conn(conn->params.peer, conn);
-enomem_peer:
- rxrpc_put_peer(peer);
-enomem:
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
+ _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid);
}
/*
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 70bb77818dea..1d87b5453ef7 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1,6 +1,6 @@
/* RxRPC packet reception
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -27,550 +27,948 @@
#include <net/net_namespace.h>
#include "ar-internal.h"
+static void rxrpc_proto_abort(const char *why,
+ struct rxrpc_call *call, rxrpc_seq_t seq)
+{
+ if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) {
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+}
+
/*
- * queue a packet for recvmsg to pass to userspace
- * - the caller must hold a lock on call->lock
- * - must not be called with interrupts disabled (sk_filter() disables BH's)
- * - eats the packet whether successful or not
- * - there must be just one reference to the packet, which the caller passes to
- * this function
+ * Do TCP-style congestion management [RFC 5681].
*/
-int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
- bool force, bool terminal)
+static void rxrpc_congestion_management(struct rxrpc_call *call,
+ struct sk_buff *skb,
+ struct rxrpc_ack_summary *summary,
+ rxrpc_serial_t acked_serial)
{
- struct rxrpc_skb_priv *sp;
- struct rxrpc_sock *rx = call->socket;
- struct sock *sk;
- int ret;
+ enum rxrpc_congest_change change = rxrpc_cong_no_change;
+ unsigned int cumulative_acks = call->cong_cumul_acks;
+ unsigned int cwnd = call->cong_cwnd;
+ bool resend = false;
+
+ summary->flight_size =
+ (call->tx_top - call->tx_hard_ack) - summary->nr_acks;
+
+ if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
+ summary->retrans_timeo = true;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = 1;
+ if (cwnd >= call->cong_ssthresh &&
+ call->cong_mode == RXRPC_CALL_SLOW_START) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ cumulative_acks = 0;
+ }
+ }
- _enter(",,%d,%d", force, terminal);
+ cumulative_acks += summary->nr_new_acks;
+ cumulative_acks += summary->nr_rot_new_acks;
+ if (cumulative_acks > 255)
+ cumulative_acks = 255;
+
+ summary->mode = call->cong_mode;
+ summary->cwnd = call->cong_cwnd;
+ summary->ssthresh = call->cong_ssthresh;
+ summary->cumulative_acks = cumulative_acks;
+ summary->dup_acks = call->cong_dup_acks;
+
+ switch (call->cong_mode) {
+ case RXRPC_CALL_SLOW_START:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
+ if (summary->cumulative_acks > 0)
+ cwnd += 1;
+ if (cwnd >= call->cong_ssthresh) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ }
+ goto out;
- ASSERT(!irqs_disabled());
+ case RXRPC_CALL_CONGEST_AVOIDANCE:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
- sp = rxrpc_skb(skb);
- ASSERTCMP(sp->call, ==, call);
-
- /* if we've already posted the terminal message for a call, then we
- * don't post any more */
- if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
- _debug("already terminated");
- ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
- rxrpc_free_skb(skb);
- return 0;
- }
-
- sk = &rx->sk;
-
- if (!force) {
- /* cast skb->rcvbuf to unsigned... It's pointless, but
- * reduces number of warnings when compiling with -W
- * --ANK */
-// ret = -ENOBUFS;
-// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-// (unsigned int) sk->sk_rcvbuf)
-// goto out;
-
- ret = sk_filter(sk, skb);
- if (ret < 0)
+ /* We analyse the number of packets that get ACK'd per RTT
+ * period and increase the window if we managed to fill it.
+ */
+ if (call->peer->rtt_usage == 0)
goto out;
- }
+ if (ktime_before(skb->tstamp,
+ ktime_add_ns(call->cong_tstamp,
+ call->peer->rtt)))
+ goto out_no_clear_ca;
+ change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = skb->tstamp;
+ if (cumulative_acks >= cwnd)
+ cwnd++;
+ goto out;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) &&
- !test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- call->socket->sk.sk_state != RXRPC_CLOSE) {
- skb->destructor = rxrpc_packet_destructor;
- skb->dev = NULL;
- skb->sk = sk;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ case RXRPC_CALL_PACKET_LOSS:
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
- if (terminal) {
- _debug("<<<< TERMINAL MESSAGE >>>>");
- set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags);
+ if (summary->new_low_nack) {
+ change = rxrpc_cong_new_low_nack;
+ call->cong_dup_acks = 1;
+ if (call->cong_extra > 1)
+ call->cong_extra = 1;
+ goto send_extra_data;
}
- /* allow interception by a kernel service */
- if (rx->interceptor) {
- rx->interceptor(sk, call->user_call_ID, skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- } else {
- _net("post skb %p", skb);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks < 3)
+ goto send_extra_data;
+
+ change = rxrpc_cong_begin_retransmission;
+ call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = call->cong_ssthresh + 3;
+ call->cong_extra = 0;
+ call->cong_dup_acks = 0;
+ resend = true;
+ goto out;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ case RXRPC_CALL_FAST_RETRANSMIT:
+ if (!summary->new_low_nack) {
+ if (summary->nr_new_acks == 0)
+ cwnd += 1;
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks == 2) {
+ change = rxrpc_cong_retransmit_again;
+ call->cong_dup_acks = 0;
+ resend = true;
+ }
+ } else {
+ change = rxrpc_cong_progress;
+ cwnd = call->cong_ssthresh;
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
}
- skb = NULL;
- } else {
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ goto out;
+
+ default:
+ BUG();
+ goto out;
}
- ret = 0;
+resume_normality:
+ change = rxrpc_cong_cleared_nacks;
+ call->cong_dup_acks = 0;
+ call->cong_extra = 0;
+ call->cong_tstamp = skb->tstamp;
+ if (cwnd < call->cong_ssthresh)
+ call->cong_mode = RXRPC_CALL_SLOW_START;
+ else
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
out:
- rxrpc_free_skb(skb);
+ cumulative_acks = 0;
+out_no_clear_ca:
+ if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1)
+ cwnd = RXRPC_RXTX_BUFF_SIZE - 1;
+ call->cong_cwnd = cwnd;
+ call->cong_cumul_acks = cumulative_acks;
+ trace_rxrpc_congest(call, summary, acked_serial, change);
+ if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ rxrpc_queue_call(call);
+ return;
+
+packet_loss_detected:
+ change = rxrpc_cong_saw_nack;
+ call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ call->cong_dup_acks = 0;
+ goto send_extra_data;
- _leave(" = %d", ret);
- return ret;
+send_extra_data:
+ /* Send some previously unsent DATA if we have some to advance the ACK
+ * state.
+ */
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST ||
+ summary->nr_acks != call->tx_top - call->tx_hard_ack) {
+ call->cong_extra++;
+ wake_up(&call->waitq);
+ }
+ goto out_no_clear_ca;
}
/*
- * process a DATA packet, posting the packet to the appropriate queue
- * - eats the packet if successful
+ * Ping the other end to fill our RTT cache and to retrieve the rwind
+ * and MTU parameters.
*/
-static int rxrpc_fast_process_data(struct rxrpc_call *call,
- struct sk_buff *skb, u32 seq)
+static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
+ int skew)
{
- struct rxrpc_skb_priv *sp;
- bool terminal;
- int ret, ackbit, ack;
- u32 serial;
- u8 flags;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ ktime_t now = skb->tstamp;
- _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
+ if (call->peer->rtt_usage < 3 ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ true, true,
+ rxrpc_propose_ack_ping_for_params);
+}
- sp = rxrpc_skb(skb);
- ASSERTCMP(sp->call, ==, NULL);
- flags = sp->hdr.flags;
- serial = sp->hdr.serial;
+/*
+ * Apply a hard ACK by advancing the Tx window.
+ */
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
+ struct rxrpc_ack_summary *summary)
+{
+ struct sk_buff *skb, *list = NULL;
+ int ix;
+ u8 annotation;
+
+ if (call->acks_lowest_nak == call->tx_hard_ack) {
+ call->acks_lowest_nak = to;
+ } else if (before_eq(call->acks_lowest_nak, to)) {
+ summary->new_low_nack = true;
+ call->acks_lowest_nak = to;
+ }
spin_lock(&call->lock);
- if (call->state > RXRPC_CALL_COMPLETE)
- goto discard;
+ while (before(call->tx_hard_ack, to)) {
+ call->tx_hard_ack++;
+ ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ annotation = call->rxtx_annotations[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
+ call->rxtx_buffer[ix] = NULL;
+ call->rxtx_annotations[ix] = 0;
+ skb->next = list;
+ list = skb;
+
+ if (annotation & RXRPC_TX_ANNO_LAST)
+ set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+ if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
+ summary->nr_rot_new_acks++;
+ }
- ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post);
- ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
- ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
+ spin_unlock(&call->lock);
- if (seq < call->rx_data_post) {
- _debug("dup #%u [-%u]", seq, call->rx_data_post);
- ack = RXRPC_ACK_DUPLICATE;
- ret = -ENOBUFS;
- goto discard_and_ack;
- }
+ trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
+ rxrpc_transmit_rotate_last :
+ rxrpc_transmit_rotate));
+ wake_up(&call->waitq);
- /* we may already have the packet in the out of sequence queue */
- ackbit = seq - (call->rx_data_eaten + 1);
- ASSERTCMP(ackbit, >=, 0);
- if (__test_and_set_bit(ackbit, call->ackr_window)) {
- _debug("dup oos #%u [%u,%u]",
- seq, call->rx_data_eaten, call->rx_data_post);
- ack = RXRPC_ACK_DUPLICATE;
- goto discard_and_ack;
+ while (list) {
+ skb = list;
+ list = skb->next;
+ skb->next = NULL;
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
}
+}
- if (seq >= call->ackr_win_top) {
- _debug("exceed #%u [%u]", seq, call->ackr_win_top);
- __clear_bit(ackbit, call->ackr_window);
- ack = RXRPC_ACK_EXCEEDS_WINDOW;
- goto discard_and_ack;
- }
+/*
+ * End the transmission phase of a call.
+ *
+ * This occurs when we get an ACKALL packet, the first DATA packet of a reply,
+ * or a final ACK packet.
+ */
+static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
+ const char *abort_why)
+{
- if (seq == call->rx_data_expect) {
- clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
- call->rx_data_expect++;
- } else if (seq > call->rx_data_expect) {
- _debug("oos #%u [%u]", seq, call->rx_data_expect);
- call->rx_data_expect = seq + 1;
- if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
- ack = RXRPC_ACK_OUT_OF_SEQUENCE;
- goto enqueue_and_ack;
- }
- goto enqueue_packet;
- }
+ ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
- if (seq != call->rx_data_post) {
- _debug("ahead #%u [%u]", seq, call->rx_data_post);
- goto enqueue_packet;
- }
+ write_lock(&call->state_lock);
- if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags))
- goto protocol_error;
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ if (reply_begun)
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ else
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ break;
- /* if the packet need security things doing to it, then it goes down
- * the slow path */
- if (call->conn->security_ix)
- goto enqueue_packet;
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ __rxrpc_call_completed(call);
+ rxrpc_notify_socket(call);
+ break;
- sp->call = call;
- rxrpc_get_call(call);
- atomic_inc(&call->skb_count);
- terminal = ((flags & RXRPC_LAST_PACKET) &&
- !(flags & RXRPC_CLIENT_INITIATED));
- ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
- if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOBUFS) {
- __clear_bit(ackbit, call->ackr_window);
- ack = RXRPC_ACK_NOSPACE;
- goto discard_and_ack;
- }
- goto out;
+ default:
+ goto bad_state;
}
- skb = NULL;
- sp = NULL;
-
- _debug("post #%u", seq);
- ASSERTCMP(call->rx_data_post, ==, seq);
- call->rx_data_post++;
+ write_unlock(&call->state_lock);
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true,
+ rxrpc_propose_ack_client_tx_end);
+ trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
+ } else {
+ trace_rxrpc_transmit(call, rxrpc_transmit_end);
+ }
+ _leave(" = ok");
+ return true;
+
+bad_state:
+ write_unlock(&call->state_lock);
+ kdebug("end_tx %s", rxrpc_call_states[call->state]);
+ rxrpc_proto_abort(abort_why, call, call->tx_top);
+ return false;
+}
- if (flags & RXRPC_LAST_PACKET)
- set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
+/*
+ * Begin the reply reception phase of a call.
+ */
+static bool rxrpc_receiving_reply(struct rxrpc_call *call)
+{
+ struct rxrpc_ack_summary summary = { 0 };
+ rxrpc_seq_t top = READ_ONCE(call->tx_top);
+
+ if (call->ackr_reason) {
+ spin_lock_bh(&call->lock);
+ call->ackr_reason = 0;
+ call->resend_at = call->expire_at;
+ call->ack_at = call->expire_at;
+ spin_unlock_bh(&call->lock);
+ rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
+ ktime_get_real());
+ }
- /* if we've reached an out of sequence packet then we need to drain
- * that queue into the socket Rx queue now */
- if (call->rx_data_post == call->rx_first_oos) {
- _debug("drain rx oos now");
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_rotate_tx_window(call, top, &summary);
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_proto_abort("TXL", call, top);
+ return false;
}
+ if (!rxrpc_end_tx_phase(call, true, "ETD"))
+ return false;
+ call->tx_phase = false;
+ return true;
+}
- spin_unlock(&call->lock);
- atomic_inc(&call->ackr_not_idle);
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false);
- _leave(" = 0 [posted]");
- return 0;
+/*
+ * Scan a jumbo packet to validate its structure and to work out how many
+ * subpackets it contains.
+ *
+ * A jumbo packet is a collection of consecutive packets glued together with
+ * little headers between that indicate how to change the initial header for
+ * each subpacket.
+ *
+ * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but
+ * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any
+ * size.
+ */
+static bool rxrpc_validate_jumbo(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
+ unsigned int len = skb->len;
+ int nr_jumbo = 1;
+ u8 flags = sp->hdr.flags;
-protocol_error:
- ret = -EBADMSG;
-out:
- spin_unlock(&call->lock);
- _leave(" = %d", ret);
- return ret;
+ do {
+ nr_jumbo++;
+ if (len - offset < RXRPC_JUMBO_SUBPKTLEN)
+ goto protocol_error;
+ if (flags & RXRPC_LAST_PACKET)
+ goto protocol_error;
+ offset += RXRPC_JUMBO_DATALEN;
+ if (skb_copy_bits(skb, offset, &flags, 1) < 0)
+ goto protocol_error;
+ offset += sizeof(struct rxrpc_jumbo_header);
+ } while (flags & RXRPC_JUMBO_PACKET);
-discard_and_ack:
- _debug("discard and ACK packet %p", skb);
- __rxrpc_propose_ACK(call, ack, serial, true);
-discard:
- spin_unlock(&call->lock);
- rxrpc_free_skb(skb);
- _leave(" = 0 [discarded]");
- return 0;
+ sp->nr_jumbo = nr_jumbo;
+ return true;
-enqueue_and_ack:
- __rxrpc_propose_ACK(call, ack, serial, true);
-enqueue_packet:
- _net("defer skb %p", skb);
- spin_unlock(&call->lock);
- skb_queue_tail(&call->rx_queue, skb);
- atomic_inc(&call->ackr_not_idle);
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
- _leave(" = 0 [queued]");
- return 0;
+protocol_error:
+ return false;
}
/*
- * assume an implicit ACKALL of the transmission phase of a client socket upon
- * reception of the first reply packet
+ * Handle reception of a duplicate packet.
+ *
+ * We have to take care to avoid an attack here whereby we're given a series of
+ * jumbograms, each with a sequence number one before the preceding one and
+ * filled up to maximum UDP size. If they never send us the first packet in
+ * the sequence, they can cause us to have to hold on to around 2MiB of kernel
+ * space until the call times out.
+ *
+ * We limit the space usage by only accepting three duplicate jumbo packets per
+ * call. After that, we tell the other side we're no longer accepting jumbos
+ * (that information is encoded in the ACK packet).
*/
-static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
+static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
+ u8 annotation, bool *_jumbo_bad)
{
- write_lock_bh(&call->state_lock);
-
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
- call->acks_latest = serial;
-
- _debug("implicit ACKALL %%%u", call->acks_latest);
- set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
- write_unlock_bh(&call->state_lock);
-
- if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
- break;
+ /* Discard normal packets that are duplicates. */
+ if (annotation == 0)
+ return;
- default:
- write_unlock_bh(&call->state_lock);
- break;
+ /* Skip jumbo subpackets that are duplicates. When we've had three or
+ * more partially duplicate jumbo packets, we refuse to take any more
+ * jumbos for this call.
+ */
+ if (!*_jumbo_bad) {
+ call->nr_jumbo_bad++;
+ *_jumbo_bad = true;
}
}
/*
- * post an incoming packet to the nominated call to deal with
- * - must get rid of the sk_buff, either by freeing it or by queuing it
+ * Process a DATA packet, adding the packet to the Rx ring.
*/
-void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
+static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
+ u16 skew)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- __be32 wtmp;
- u32 hi_serial, abort_code;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
+ unsigned int ix;
+ rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
+ rxrpc_seq_t seq = sp->hdr.seq, hard_ack;
+ bool immediate_ack = false, jumbo_bad = false, queued;
+ u16 len;
+ u8 ack = 0, flags, annotation = 0;
- _enter("%p,%p", call, skb);
+ _enter("{%u,%u},{%u,%u}",
+ call->rx_hard_ack, call->rx_top, skb->len, seq);
- ASSERT(!irqs_disabled());
+ _proto("Rx DATA %%%u { #%u f=%02x }",
+ sp->hdr.serial, seq, sp->hdr.flags);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ /* Received data implicitly ACKs all of the request packets we sent
+ * when we're acting as a client.
+ */
+ if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
+ !rxrpc_receiving_reply(call))
+ return;
+
+ call->ackr_prev_seq = seq;
-#if 0 // INJECT RX ERROR
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
- static int skip = 0;
- if (++skip == 3) {
- printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n");
- skip = 0;
- goto free_packet;
+ hard_ack = READ_ONCE(call->rx_hard_ack);
+ if (after(seq, hard_ack + call->rx_winsize)) {
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ ack_serial = serial;
+ goto ack;
+ }
+
+ flags = sp->hdr.flags;
+ if (flags & RXRPC_JUMBO_PACKET) {
+ if (call->nr_jumbo_bad > 3) {
+ ack = RXRPC_ACK_NOSPACE;
+ ack_serial = serial;
+ goto ack;
}
+ annotation = 1;
}
-#endif
- /* track the latest serial number on this connection for ACK packet
- * information */
- hi_serial = atomic_read(&call->conn->hi_serial);
- while (sp->hdr.serial > hi_serial)
- hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
- sp->hdr.serial);
+next_subpacket:
+ queued = false;
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ len = skb->len;
+ if (flags & RXRPC_JUMBO_PACKET)
+ len = RXRPC_JUMBO_DATALEN;
+
+ if (flags & RXRPC_LAST_PACKET) {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ seq != call->rx_top)
+ return rxrpc_proto_abort("LSN", call, seq);
+ } else {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ after_eq(seq, call->rx_top))
+ return rxrpc_proto_abort("LSA", call, seq);
+ }
- /* request ACK generation for any ACK or DATA packet that requests
- * it */
- if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
- _proto("ACK Requested on %%%u", sp->hdr.serial);
- rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
+ if (before_eq(seq, hard_ack)) {
+ ack = RXRPC_ACK_DUPLICATE;
+ ack_serial = serial;
+ goto skip;
}
- switch (sp->hdr.type) {
- case RXRPC_PACKET_TYPE_ABORT:
- _debug("abort");
+ if (flags & RXRPC_REQUEST_ACK && !ack) {
+ ack = RXRPC_ACK_REQUESTED;
+ ack_serial = serial;
+ }
- if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
- goto protocol_error;
+ if (call->rxtx_buffer[ix]) {
+ rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad);
+ if (ack != RXRPC_ACK_DUPLICATE) {
+ ack = RXRPC_ACK_DUPLICATE;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ goto skip;
+ }
- abort_code = ntohl(wtmp);
- _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+ /* Queue the packet. We use a couple of memory barriers here as need
+ * to make sure that rx_top is perceived to be set after the buffer
+ * pointer and that the buffer pointer is set after the annotation and
+ * the skb data.
+ *
+ * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
+ * and also rxrpc_fill_out_ack().
+ */
+ rxrpc_get_skb(skb, rxrpc_skb_rx_got);
+ call->rxtx_annotations[ix] = annotation;
+ smp_wmb();
+ call->rxtx_buffer[ix] = skb;
+ if (after(seq, call->rx_top)) {
+ smp_store_release(&call->rx_top, seq);
+ } else if (before(seq, call->rx_top)) {
+ /* Send an immediate ACK if we fill in a hole */
+ if (!ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ }
+ if (flags & RXRPC_LAST_PACKET) {
+ set_bit(RXRPC_CALL_RX_LAST, &call->flags);
+ trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
+ } else {
+ trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
+ }
+ queued = true;
- write_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE) {
- call->state = RXRPC_CALL_REMOTELY_ABORTED;
- call->remote_abort = abort_code;
- set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- rxrpc_queue_call(call);
+ if (after_eq(seq, call->rx_expect_next)) {
+ if (after(seq, call->rx_expect_next)) {
+ _net("OOS %u > %u", seq, call->rx_expect_next);
+ ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+ ack_serial = serial;
}
- goto free_packet_unlock;
+ call->rx_expect_next = seq + 1;
+ }
- case RXRPC_PACKET_TYPE_BUSY:
- _proto("Rx BUSY %%%u", sp->hdr.serial);
+skip:
+ offset += len;
+ if (flags & RXRPC_JUMBO_PACKET) {
+ if (skb_copy_bits(skb, offset, &flags, 1) < 0)
+ return rxrpc_proto_abort("XJF", call, seq);
+ offset += sizeof(struct rxrpc_jumbo_header);
+ seq++;
+ serial++;
+ annotation++;
+ if (flags & RXRPC_JUMBO_PACKET)
+ annotation |= RXRPC_RX_ANNO_JLAST;
+ if (after(seq, hard_ack + call->rx_winsize)) {
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ ack_serial = serial;
+ if (!jumbo_bad) {
+ call->nr_jumbo_bad++;
+ jumbo_bad = true;
+ }
+ goto ack;
+ }
- if (rxrpc_conn_is_service(call->conn))
- goto protocol_error;
+ _proto("Rx DATA Jumbo %%%u", serial);
+ goto next_subpacket;
+ }
- write_lock_bh(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_SEND_REQUEST:
- call->state = RXRPC_CALL_SERVER_BUSY;
- set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
- rxrpc_queue_call(call);
- case RXRPC_CALL_SERVER_BUSY:
- goto free_packet_unlock;
- default:
- goto protocol_error_locked;
- }
+ if (queued && flags & RXRPC_LAST_PACKET && !ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
- default:
- _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
- goto protocol_error;
+ack:
+ if (ack)
+ rxrpc_propose_ACK(call, ack, skew, ack_serial,
+ immediate_ack, true,
+ rxrpc_propose_ack_input_data);
- case RXRPC_PACKET_TYPE_DATA:
- _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+ if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
+ rxrpc_notify_socket(call);
+ _leave(" [queued]");
+}
- if (sp->hdr.seq == 0)
- goto protocol_error;
+/*
+ * Process a requested ACK.
+ */
+static void rxrpc_input_requested_ack(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ ktime_t sent_at;
+ int ix;
+
+ for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) {
+ skb = call->rxtx_buffer[ix];
+ if (!skb)
+ continue;
+
+ sp = rxrpc_skb(skb);
+ if (sp->hdr.serial != orig_serial)
+ continue;
+ smp_rmb();
+ sent_at = skb->tstamp;
+ goto found;
+ }
+ return;
- call->ackr_prev_seq = sp->hdr.seq;
+found:
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack,
+ orig_serial, ack_serial, sent_at, resp_time);
+}
- /* received data implicitly ACKs all of the request packets we
- * sent when we're acting as a client */
- if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
- rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
+/*
+ * Process a ping response.
+ */
+static void rxrpc_input_ping_response(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ rxrpc_serial_t ping_serial;
+ ktime_t ping_time;
- switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
- case 0:
- skb = NULL;
- goto done;
+ ping_time = call->ping_time;
+ smp_rmb();
+ ping_serial = call->ping_serial;
- default:
- BUG();
+ if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
+ before(orig_serial, ping_serial))
+ return;
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
+ if (after(orig_serial, ping_serial))
+ return;
- /* data packet received beyond the last packet */
- case -EBADMSG:
- goto protocol_error;
- }
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response,
+ orig_serial, ack_serial, ping_time, resp_time);
+}
- case RXRPC_PACKET_TYPE_ACKALL:
- case RXRPC_PACKET_TYPE_ACK:
- /* ACK processing is done in process context */
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD) {
- skb_queue_tail(&call->rx_queue, skb);
- rxrpc_queue_call(call);
- skb = NULL;
- }
- read_unlock_bh(&call->state_lock);
- goto free_packet;
+/*
+ * Process the extra information that may be appended to an ACK packet
+ */
+static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
+ struct rxrpc_ackinfo *ackinfo)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_peer *peer;
+ unsigned int mtu;
+ u32 rwind = ntohl(ackinfo->rwind);
+
+ _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
+ sp->hdr.serial,
+ ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
+ rwind, ntohl(ackinfo->jumbo_max));
+
+ if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
+ rwind = RXRPC_RXTX_BUFF_SIZE - 1;
+ call->tx_winsize = rwind;
+ if (call->cong_ssthresh > rwind)
+ call->cong_ssthresh = rwind;
+
+ mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU));
+
+ peer = call->peer;
+ if (mtu < peer->maxdata) {
+ spin_lock_bh(&peer->lock);
+ peer->maxdata = mtu;
+ peer->mtu = mtu + peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
}
+}
-protocol_error:
- _debug("protocol error");
- write_lock_bh(&call->state_lock);
-protocol_error_locked:
- if (call->state <= RXRPC_CALL_COMPLETE) {
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = RX_PROTOCOL_ERROR;
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- rxrpc_queue_call(call);
+/*
+ * Process individual soft ACKs.
+ *
+ * Each ACK in the array corresponds to one packet and can be either an ACK or
+ * a NAK. If we get find an explicitly NAK'd packet we resend immediately;
+ * packets that lie beyond the end of the ACK list are scheduled for resend by
+ * the timer on the basis that the peer might just not have processed them at
+ * the time the ACK was sent.
+ */
+static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
+ rxrpc_seq_t seq, int nr_acks,
+ struct rxrpc_ack_summary *summary)
+{
+ int ix;
+ u8 annotation, anno_type;
+
+ for (; nr_acks > 0; nr_acks--, seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ switch (*acks++) {
+ case RXRPC_ACK_TYPE_ACK:
+ summary->nr_acks++;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
+ continue;
+ summary->nr_new_acks++;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_ACK | annotation;
+ break;
+ case RXRPC_ACK_TYPE_NACK:
+ if (!summary->nr_nacks &&
+ call->acks_lowest_nak != seq) {
+ call->acks_lowest_nak = seq;
+ summary->new_low_nack = true;
+ }
+ summary->nr_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_NAK)
+ continue;
+ summary->nr_new_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS)
+ continue;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_NAK | annotation;
+ break;
+ default:
+ return rxrpc_proto_abort("SFT", call, 0);
+ }
}
-free_packet_unlock:
- write_unlock_bh(&call->state_lock);
-free_packet:
- rxrpc_free_skb(skb);
-done:
- _leave("");
}
/*
- * split up a jumbo data packet
+ * Process an ACK packet.
+ *
+ * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet
+ * in the ACK array. Anything before that is hard-ACK'd and may be discarded.
+ *
+ * A hard-ACK means that a packet has been processed and may be discarded; a
+ * soft-ACK means that the packet may be discarded and retransmission
+ * requested. A phase is complete when all packets are hard-ACK'd.
*/
-static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
- struct sk_buff *jumbo)
+static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
+ u16 skew)
{
- struct rxrpc_jumbo_header jhdr;
- struct rxrpc_skb_priv *sp;
- struct sk_buff *part;
+ struct rxrpc_ack_summary summary = { 0 };
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ union {
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_ackinfo info;
+ u8 acks[RXRPC_MAXACKS];
+ } buf;
+ rxrpc_serial_t acked_serial;
+ rxrpc_seq_t first_soft_ack, hard_ack;
+ int nr_acks, offset, ioffset;
+
+ _enter("");
+
+ offset = sizeof(struct rxrpc_wire_header);
+ if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) {
+ _debug("extraction failure");
+ return rxrpc_proto_abort("XAK", call, 0);
+ }
+ offset += sizeof(buf.ack);
+
+ acked_serial = ntohl(buf.ack.serial);
+ first_soft_ack = ntohl(buf.ack.firstPacket);
+ hard_ack = first_soft_ack - 1;
+ nr_acks = buf.ack.nAcks;
+ summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
+ buf.ack.reason : RXRPC_ACK__INVALID);
+
+ trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
+
+ _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ sp->hdr.serial,
+ ntohs(buf.ack.maxSkew),
+ first_soft_ack,
+ ntohl(buf.ack.previousPacket),
+ acked_serial,
+ rxrpc_ack_names[summary.ack_reason],
+ buf.ack.nAcks);
+
+ if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+ if (buf.ack.reason == RXRPC_ACK_REQUESTED)
+ rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+
+ if (buf.ack.reason == RXRPC_ACK_PING) {
+ _proto("Rx ACK %%%u PING Request", sp->hdr.serial);
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ping);
+ } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ack);
+ }
- _enter(",{%u,%u}", jumbo->data_len, jumbo->len);
+ ioffset = offset + nr_acks + 3;
+ if (skb->len >= ioffset + sizeof(buf.info)) {
+ if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
+ return rxrpc_proto_abort("XAI", call, 0);
+ rxrpc_input_ackinfo(call, skb, &buf.info);
+ }
- sp = rxrpc_skb(jumbo);
+ if (first_soft_ack == 0)
+ return rxrpc_proto_abort("AK0", call, 0);
- do {
- sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
-
- /* make a clone to represent the first subpacket in what's left
- * of the jumbo packet */
- part = skb_clone(jumbo, GFP_ATOMIC);
- if (!part) {
- /* simply ditch the tail in the event of ENOMEM */
- pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
- break;
- }
- rxrpc_new_skb(part);
+ /* Ignore ACKs unless we are or have just been transmitting. */
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ case RXRPC_CALL_SERVER_SEND_REPLY:
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ break;
+ default:
+ return;
+ }
- pskb_trim(part, RXRPC_JUMBO_DATALEN);
+ /* Discard any out-of-order or duplicate ACKs. */
+ if (before_eq(sp->hdr.serial, call->acks_latest)) {
+ _debug("discard ACK %d <= %d",
+ sp->hdr.serial, call->acks_latest);
+ return;
+ }
+ call->acks_latest_ts = skb->tstamp;
+ call->acks_latest = sp->hdr.serial;
+
+ if (before(hard_ack, call->tx_hard_ack) ||
+ after(hard_ack, call->tx_top))
+ return rxrpc_proto_abort("AKW", call, 0);
+ if (nr_acks > call->tx_top - hard_ack)
+ return rxrpc_proto_abort("AKN", call, 0);
+
+ if (after(hard_ack, call->tx_hard_ack))
+ rxrpc_rotate_tx_window(call, hard_ack, &summary);
+
+ if (nr_acks > 0) {
+ if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
+ return rxrpc_proto_abort("XSA", call, 0);
+ rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
+ &summary);
+ }
- if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
- goto protocol_error;
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_end_tx_phase(call, false, "ETA");
+ return;
+ }
- if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
- goto protocol_error;
- if (!pskb_pull(jumbo, sizeof(jhdr)))
- BUG();
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST &&
+ summary.nr_acks == call->tx_top - hard_ack &&
+ rxrpc_is_client_call(call))
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ false, true,
+ rxrpc_propose_ack_ping_for_lost_reply);
- sp->hdr.seq += 1;
- sp->hdr.serial += 1;
- sp->hdr.flags = jhdr.flags;
- sp->hdr._rsvd = ntohs(jhdr._rsvd);
+ return rxrpc_congestion_management(call, skb, &summary, acked_serial);
+}
- _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
+/*
+ * Process an ACKALL packet.
+ */
+static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_ack_summary summary = { 0 };
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_fast_process_packet(call, part);
- part = NULL;
+ _proto("Rx ACKALL %%%u", sp->hdr.serial);
- } while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
+ rxrpc_rotate_tx_window(call, call->tx_top, &summary);
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_end_tx_phase(call, false, "ETL");
+}
- rxrpc_fast_process_packet(call, jumbo);
- _leave("");
- return;
+/*
+ * Process an ABORT packet.
+ */
+static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 wtmp;
+ u32 abort_code = RX_CALL_DEAD;
-protocol_error:
- _debug("protocol error");
- rxrpc_free_skb(part);
- rxrpc_free_skb(jumbo);
- write_lock_bh(&call->state_lock);
- if (call->state <= RXRPC_CALL_COMPLETE) {
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = RX_PROTOCOL_ERROR;
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- rxrpc_queue_call(call);
- }
- write_unlock_bh(&call->state_lock);
- _leave("");
+ _enter("");
+
+ if (skb->len >= 4 &&
+ skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &wtmp, sizeof(wtmp)) >= 0)
+ abort_code = ntohl(wtmp);
+
+ _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+
+ if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
+ abort_code, ECONNABORTED))
+ rxrpc_notify_socket(call);
}
/*
- * post an incoming packet to the appropriate call/socket to deal with
- * - must get rid of the sk_buff, either by freeing it or by queuing it
+ * Process an incoming call packet.
*/
-static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
- struct sk_buff *skb)
+static void rxrpc_input_call_packet(struct rxrpc_call *call,
+ struct sk_buff *skb, u16 skew)
{
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
_enter("%p,%p", call, skb);
- sp = rxrpc_skb(skb);
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_DATA:
+ rxrpc_input_data(call, skb, skew);
+ break;
+
+ case RXRPC_PACKET_TYPE_ACK:
+ rxrpc_input_ack(call, skb, skew);
+ break;
- _debug("extant call [%d]", call->state);
+ case RXRPC_PACKET_TYPE_BUSY:
+ _proto("Rx BUSY %%%u", sp->hdr.serial);
+
+ /* Just ignore BUSY packets from the server; the retry and
+ * lifespan timers will take care of business. BUSY packets
+ * from the client don't make sense.
+ */
+ break;
+
+ case RXRPC_PACKET_TYPE_ABORT:
+ rxrpc_input_abort(call, skb);
+ break;
- read_lock(&call->state_lock);
+ case RXRPC_PACKET_TYPE_ACKALL:
+ rxrpc_input_ackall(call, skb);
+ break;
+
+ default:
+ _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
+ break;
+ }
+
+ _leave("");
+}
+
+/*
+ * Handle a new call on a channel implicitly completing the preceding call on
+ * that channel.
+ *
+ * TODO: If callNumber > call_id + 1, renegotiate security.
+ */
+static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
+ struct rxrpc_call *call)
+{
switch (call->state) {
- case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
- rxrpc_queue_call(call);
- goto free_unlock;
- }
- case RXRPC_CALL_REMOTELY_ABORTED:
- case RXRPC_CALL_NETWORK_ERROR:
- case RXRPC_CALL_DEAD:
- goto dead_call;
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ rxrpc_call_completed(call);
+ break;
case RXRPC_CALL_COMPLETE:
- case RXRPC_CALL_CLIENT_FINAL_ACK:
- /* complete server call */
- if (rxrpc_conn_is_service(call->conn))
- goto dead_call;
- /* resend last packet of a completed call */
- _debug("final ack again");
- rxrpc_get_call(call);
- set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
- rxrpc_queue_call(call);
- goto free_unlock;
+ break;
default:
+ if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, ESHUTDOWN)) {
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
break;
}
- read_unlock(&call->state_lock);
- rxrpc_get_call(call);
-
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
- sp->hdr.flags & RXRPC_JUMBO_PACKET)
- rxrpc_process_jumbo_packet(call, skb);
- else
- rxrpc_fast_process_packet(call, skb);
-
- rxrpc_put_call(call);
- goto done;
-
-dead_call:
- if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
- skb->priority = RX_CALL_DEAD;
- rxrpc_reject_packet(call->conn->params.local, skb);
- goto unlock;
- }
-free_unlock:
- rxrpc_free_skb(skb);
-unlock:
- read_unlock(&call->state_lock);
-done:
- _leave("");
+ __rxrpc_disconnect_call(conn, call);
+ rxrpc_notify_socket(call);
}
/*
* post connection-level events to the connection
- * - this includes challenges, responses and some aborts
+ * - this includes challenges, responses, some aborts and call terminal packet
+ * retransmission.
*/
static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
struct sk_buff *skb)
@@ -595,6 +993,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
}
/*
+ * put a packet up for transport-level abort
+ */
+static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
+{
+ CHECK_SLAB_OKAY(&local->usage);
+
+ skb_queue_tail(&local->reject_queue, skb);
+ rxrpc_queue_local(local);
+}
+
+/*
* Extract the wire header from a packet and translate the byte order.
*/
static noinline
@@ -605,8 +1014,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
/* dig out the RxRPC connection details */
if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0)
return -EBADMSG;
- if (!pskb_pull(skb, sizeof(whdr)))
- BUG();
memset(sp, 0, sizeof(*sp));
sp->hdr.epoch = ntohl(whdr.epoch);
@@ -631,19 +1038,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
* shut down and the local endpoint from going away, thus sk_user_data will not
* be cleared until this function returns.
*/
-void rxrpc_data_ready(struct sock *sk)
+void rxrpc_data_ready(struct sock *udp_sk)
{
struct rxrpc_connection *conn;
+ struct rxrpc_channel *chan;
+ struct rxrpc_call *call;
struct rxrpc_skb_priv *sp;
- struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_local *local = udp_sk->sk_user_data;
struct sk_buff *skb;
- int ret;
+ unsigned int channel;
+ int ret, skew;
- _enter("%p", sk);
+ _enter("%p", udp_sk);
ASSERT(!irqs_disabled());
- skb = skb_recv_datagram(sk, 0, 1, &ret);
+ skb = skb_recv_udp(udp_sk, 0, 1, &ret);
if (!skb) {
if (ret == -EAGAIN)
return;
@@ -651,13 +1061,13 @@ void rxrpc_data_ready(struct sock *sk)
return;
}
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
_net("recv skb %p", skb);
/* we'll probably need to checksum it (didn't call sock_recvmsg) */
if (skb_checksum_complete(skb)) {
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
__UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
_leave(" [CSUM failed]");
return;
@@ -665,19 +1075,26 @@ void rxrpc_data_ready(struct sock *sk)
__UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
- /* The socket buffer we have is owned by UDP, with UDP's data all over
- * it, but we really want our own data there.
+ /* The UDP protocol already released all skb resources;
+ * we are free to add our own data there.
*/
- skb_orphan(skb);
sp = rxrpc_skb(skb);
- _net("Rx UDP packet from %08x:%04hu",
- ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
-
/* dig out the RxRPC connection details */
if (rxrpc_extract_header(sp, skb) < 0)
goto bad_message;
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ trace_rxrpc_rx_lose(sp);
+ rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
+ return;
+ }
+ }
+
+ trace_rxrpc_rx_packet(sp);
+
_net("Rx RxRPC %s ep=%x call=%x:%x",
sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
@@ -688,70 +1105,135 @@ void rxrpc_data_ready(struct sock *sk)
goto bad_message;
}
- if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) {
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_VERSION:
rxrpc_post_packet_to_local(local, skb);
goto out;
- }
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
- (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
- goto bad_message;
+ case RXRPC_PACKET_TYPE_BUSY:
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
+ goto discard;
+
+ case RXRPC_PACKET_TYPE_DATA:
+ if (sp->hdr.callNumber == 0)
+ goto bad_message;
+ if (sp->hdr.flags & RXRPC_JUMBO_PACKET &&
+ !rxrpc_validate_jumbo(skb))
+ goto bad_message;
+ break;
+ }
rcu_read_lock();
conn = rxrpc_find_connection_rcu(local, skb);
- if (!conn)
- goto cant_route_call;
+ if (conn) {
+ if (sp->hdr.securityIndex != conn->security_ix)
+ goto wrong_security;
+
+ if (sp->hdr.callNumber == 0) {
+ /* Connection-level packet */
+ _debug("CONN %p {%d}", conn, conn->debug_id);
+ rxrpc_post_packet_to_conn(conn, skb);
+ goto out_unlock;
+ }
+
+ /* Note the serial number skew here */
+ skew = (int)sp->hdr.serial - (int)conn->hi_serial;
+ if (skew >= 0) {
+ if (skew > 0)
+ conn->hi_serial = sp->hdr.serial;
+ } else {
+ skew = -skew;
+ skew = min(skew, 65535);
+ }
- if (sp->hdr.callNumber == 0) {
- /* Connection-level packet */
- _debug("CONN %p {%d}", conn, conn->debug_id);
- rxrpc_post_packet_to_conn(conn, skb);
- } else {
/* Call-bound packets are routed by connection channel. */
- unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK;
- struct rxrpc_channel *chan = &conn->channels[channel];
- struct rxrpc_call *call = rcu_dereference(chan->call);
+ channel = sp->hdr.cid & RXRPC_CHANNELMASK;
+ chan = &conn->channels[channel];
+
+ /* Ignore really old calls */
+ if (sp->hdr.callNumber < chan->last_call)
+ goto discard_unlock;
+
+ if (sp->hdr.callNumber == chan->last_call) {
+ /* For the previous service call, if completed successfully, we
+ * discard all further packets.
+ */
+ if (rxrpc_conn_is_service(conn) &&
+ (chan->last_type == RXRPC_PACKET_TYPE_ACK ||
+ sp->hdr.type == RXRPC_PACKET_TYPE_ABORT))
+ goto discard_unlock;
+
+ /* But otherwise we need to retransmit the final packet from
+ * data cached in the connection record.
+ */
+ rxrpc_post_packet_to_conn(conn, skb);
+ goto out_unlock;
+ }
- if (!call || atomic_read(&call->usage) == 0)
- goto cant_route_call;
+ call = rcu_dereference(chan->call);
+
+ if (sp->hdr.callNumber > chan->call_id) {
+ if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) {
+ rcu_read_unlock();
+ goto reject_packet;
+ }
+ if (call)
+ rxrpc_input_implicit_end_call(conn, call);
+ call = NULL;
+ }
+ } else {
+ skew = 0;
+ call = NULL;
+ }
- rxrpc_post_packet_to_call(call, skb);
+ if (!call || atomic_read(&call->usage) == 0) {
+ if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) ||
+ sp->hdr.callNumber == 0 ||
+ sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
+ goto bad_message_unlock;
+ if (sp->hdr.seq != 1)
+ goto discard_unlock;
+ call = rxrpc_new_incoming_call(local, conn, skb);
+ if (!call) {
+ rcu_read_unlock();
+ goto reject_packet;
+ }
+ rxrpc_send_ping(call, skb, skew);
}
+ rxrpc_input_call_packet(call, skb, skew);
+ goto discard_unlock;
+
+discard_unlock:
rcu_read_unlock();
+discard:
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
out:
+ trace_rxrpc_rx_done(0, 0);
return;
-cant_route_call:
+out_unlock:
rcu_read_unlock();
+ goto out;
- _debug("can't route call");
- if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
- sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
- if (sp->hdr.seq == 1) {
- _debug("first packet");
- skb_queue_tail(&local->accept_queue, skb);
- rxrpc_queue_work(&local->processor);
- _leave(" [incoming]");
- return;
- }
- skb->priority = RX_INVALID_OPERATION;
- } else {
- skb->priority = RX_CALL_DEAD;
- }
-
- if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
- _debug("reject type %d",sp->hdr.type);
- rxrpc_reject_packet(local, skb);
- } else {
- rxrpc_free_skb(skb);
- }
- _leave(" [no call]");
- return;
+wrong_security:
+ rcu_read_unlock();
+ trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RXKADINCONSISTENCY, EBADMSG);
+ skb->priority = RXKADINCONSISTENCY;
+ goto post_abort;
+bad_message_unlock:
+ rcu_read_unlock();
bad_message:
+ trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_PROTOCOL_ERROR, EBADMSG);
skb->priority = RX_PROTOCOL_ERROR;
+post_abort:
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+reject_packet:
+ trace_rxrpc_rx_done(skb->mark, skb->priority);
rxrpc_reject_packet(local, skb);
_leave(" [badmsg]");
}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index c21ad213b337..7d4375e557e6 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -23,31 +23,36 @@ static int none_prime_packet_security(struct rxrpc_connection *conn)
}
static int none_secure_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
{
return 0;
}
-static int none_verify_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq, u16 expected_cksum)
{
return 0;
}
+static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+}
+
static int none_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb,
- u32 *_abort_code)
+ struct sk_buff *skb,
+ u32 *_abort_code)
{
*_abort_code = RX_PROTOCOL_ERROR;
return -EPROTO;
}
static int none_verify_response(struct rxrpc_connection *conn,
- struct sk_buff *skb,
- u32 *_abort_code)
+ struct sk_buff *skb,
+ u32 *_abort_code)
{
*_abort_code = RX_PROTOCOL_ERROR;
return -EPROTO;
@@ -78,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = {
.prime_packet_security = none_prime_packet_security,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
+ .locate_data = none_locate_data,
.respond_to_challenge = none_respond_to_challenge,
.verify_response = none_verify_response,
.clear = none_clear,
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 31a3f86ef2f6..540d3955c1bc 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -15,8 +15,6 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
-#include <linux/udp.h>
-#include <linux/ip.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <generated/utsrelease.h>
@@ -33,7 +31,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
{
struct rxrpc_wire_header whdr;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct sockaddr_in sin;
+ struct sockaddr_rxrpc srx;
struct msghdr msg;
struct kvec iov[2];
size_t len;
@@ -41,12 +39,11 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
_enter("");
- sin.sin_family = AF_INET;
- sin.sin_port = udp_hdr(skb)->source;
- sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
+ return;
- msg.msg_name = &sin;
- msg.msg_namelen = sizeof(sin);
+ msg.msg_name = &srx.transport;
+ msg.msg_namelen = srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -93,11 +90,13 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
if (skb) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
- if (skb_copy_bits(skb, 0, &v, 1) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &v, 1) < 0)
return;
_proto("Rx VERSION { %02x }", v);
if (v == 0)
@@ -109,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
break;
}
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
}
_leave("");
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index a753796fbe8f..ff4864d550b8 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -58,6 +58,17 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local,
memcmp(&local->srx.transport.sin.sin_addr,
&srx->transport.sin.sin_addr,
sizeof(struct in_addr));
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ /* If the choice of UDP6 port is left up to the transport, then
+ * the endpoint record doesn't match.
+ */
+ return ((u16 __force)local->srx.transport.sin6.sin6_port -
+ (u16 __force)srx->transport.sin6.sin6_port) ?:
+ memcmp(&local->srx.transport.sin6.sin6_addr,
+ &srx->transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+#endif
default:
BUG();
}
@@ -75,9 +86,7 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
atomic_set(&local->usage, 1);
INIT_LIST_HEAD(&local->link);
INIT_WORK(&local->processor, rxrpc_local_processor);
- INIT_LIST_HEAD(&local->services);
init_rwsem(&local->defrag_sem);
- skb_queue_head_init(&local->accept_queue);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
local->client_conns = RB_ROOT;
@@ -101,11 +110,12 @@ static int rxrpc_open_socket(struct rxrpc_local *local)
struct sock *sock;
int ret, opt;
- _enter("%p{%d}", local, local->srx.transport_type);
+ _enter("%p{%d,%d}",
+ local, local->srx.transport_type, local->srx.transport.family);
/* create a socket to represent the local endpoint */
- ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
- IPPROTO_UDP, &local->socket);
+ ret = sock_create_kern(&init_net, local->srx.transport.family,
+ local->srx.transport_type, 0, &local->socket);
if (ret < 0) {
_leave(" = %d [socket]", ret);
return ret;
@@ -170,18 +180,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
long diff;
int ret;
- if (srx->transport.family == AF_INET) {
- _enter("{%d,%u,%pI4+%hu}",
- srx->transport_type,
- srx->transport.family,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
- } else {
- _enter("{%d,%u}",
- srx->transport_type,
- srx->transport.family);
- return ERR_PTR(-EAFNOSUPPORT);
- }
+ _enter("{%d,%d,%pISp}",
+ srx->transport_type, srx->transport.family, &srx->transport);
mutex_lock(&rxrpc_local_mutex);
@@ -234,13 +234,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
found:
mutex_unlock(&rxrpc_local_mutex);
- _net("LOCAL %s %d {%d,%u,%pI4+%hu}",
- age,
- local->debug_id,
- local->srx.transport_type,
- local->srx.transport.family,
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
+ _net("LOCAL %s %d {%pISp}",
+ age, local->debug_id, &local->srx.transport);
_leave(" = %p", local);
return local;
@@ -296,7 +291,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
mutex_unlock(&rxrpc_local_mutex);
ASSERT(RB_EMPTY_ROOT(&local->client_conns));
- ASSERT(list_empty(&local->services));
+ ASSERT(!local->service);
if (socket) {
local->socket = NULL;
@@ -308,7 +303,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
/* At this point, there should be no more packets coming in to the
* local endpoint.
*/
- rxrpc_purge_queue(&local->accept_queue);
rxrpc_purge_queue(&local->reject_queue);
rxrpc_purge_queue(&local->event_queue);
@@ -332,11 +326,6 @@ static void rxrpc_local_processor(struct work_struct *work)
if (atomic_read(&local->usage) == 0)
return rxrpc_local_destroyer(local);
- if (!skb_queue_empty(&local->accept_queue)) {
- rxrpc_accept_incoming_calls(local);
- again = true;
- }
-
if (!skb_queue_empty(&local->reject_queue)) {
rxrpc_reject_packets(local);
again = true;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index bdc5e42fe600..6dee55fad2d3 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -21,28 +21,33 @@
unsigned int rxrpc_max_backlog __read_mostly = 10;
/*
+ * Maximum lifetime of a call (in mx).
+ */
+unsigned int rxrpc_max_call_lifetime = 60 * 1000;
+
+/*
* How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in jiffies).
+ * packet with RXRPC_REQUEST_ACK set (in ms).
*/
unsigned int rxrpc_requested_ack_delay = 1;
/*
- * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ * How long to wait before scheduling an ACK with subtype DELAY (in ms).
*
* We use this when we've received new data packets. If those packets aren't
* all consumed within this time we will send a DELAY ACK if an ACK was not
* requested to let the sender know it doesn't need to resend.
*/
-unsigned int rxrpc_soft_ack_delay = 1 * HZ;
+unsigned int rxrpc_soft_ack_delay = 1 * 1000;
/*
- * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ * How long to wait before scheduling an ACK with subtype IDLE (in ms).
*
* We use this when we've consumed some previously soft-ACK'd packets when
* further packets aren't immediately received to decide when to send an IDLE
* ACK let the other end know that it can free up its Tx buffer space.
*/
-unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
+unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
/*
* Receive window size in packets. This indicates the maximum number of
@@ -50,7 +55,10 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
* limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
* packets.
*/
-unsigned int rxrpc_rx_window_size = 32;
+unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE;
+#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
+#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
+#endif
/*
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
@@ -64,6 +72,11 @@ unsigned int rxrpc_rx_mtu = 5692;
*/
unsigned int rxrpc_rx_jumbo_max = 4;
+/*
+ * Time till packet resend (in milliseconds).
+ */
+unsigned int rxrpc_resend_timeout = 4 * 1000;
+
const char *const rxrpc_pkts[] = {
"?00",
"DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
@@ -75,21 +88,154 @@ const s8 rxrpc_ack_priority[] = {
[RXRPC_ACK_DELAY] = 1,
[RXRPC_ACK_REQUESTED] = 2,
[RXRPC_ACK_IDLE] = 3,
- [RXRPC_ACK_PING_RESPONSE] = 4,
- [RXRPC_ACK_DUPLICATE] = 5,
- [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
- [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
- [RXRPC_ACK_NOSPACE] = 8,
-};
-
-const char *rxrpc_acks(u8 reason)
-{
- static const char *const str[] = {
- "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
- "IDL", "-?-"
- };
-
- if (reason >= ARRAY_SIZE(str))
- reason = ARRAY_SIZE(str) - 1;
- return str[reason];
-}
+ [RXRPC_ACK_DUPLICATE] = 4,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 5,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 6,
+ [RXRPC_ACK_NOSPACE] = 7,
+ [RXRPC_ACK_PING_RESPONSE] = 8,
+};
+
+const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
+ "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+ "IDL", "-?-"
+};
+
+const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
+ [rxrpc_skb_rx_cleaned] = "Rx CLN",
+ [rxrpc_skb_rx_freed] = "Rx FRE",
+ [rxrpc_skb_rx_got] = "Rx GOT",
+ [rxrpc_skb_rx_lost] = "Rx *L*",
+ [rxrpc_skb_rx_received] = "Rx RCV",
+ [rxrpc_skb_rx_purged] = "Rx PUR",
+ [rxrpc_skb_rx_rotated] = "Rx ROT",
+ [rxrpc_skb_rx_seen] = "Rx SEE",
+ [rxrpc_skb_tx_cleaned] = "Tx CLN",
+ [rxrpc_skb_tx_freed] = "Tx FRE",
+ [rxrpc_skb_tx_got] = "Tx GOT",
+ [rxrpc_skb_tx_new] = "Tx NEW",
+ [rxrpc_skb_tx_rotated] = "Tx ROT",
+ [rxrpc_skb_tx_seen] = "Tx SEE",
+};
+
+const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
+ [rxrpc_conn_new_client] = "NWc",
+ [rxrpc_conn_new_service] = "NWs",
+ [rxrpc_conn_queued] = "QUE",
+ [rxrpc_conn_seen] = "SEE",
+ [rxrpc_conn_got] = "GOT",
+ [rxrpc_conn_put_client] = "PTc",
+ [rxrpc_conn_put_service] = "PTs",
+};
+
+const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
+ [rxrpc_client_activate_chans] = "Activa",
+ [rxrpc_client_alloc] = "Alloc ",
+ [rxrpc_client_chan_activate] = "ChActv",
+ [rxrpc_client_chan_disconnect] = "ChDisc",
+ [rxrpc_client_chan_pass] = "ChPass",
+ [rxrpc_client_chan_unstarted] = "ChUnst",
+ [rxrpc_client_cleanup] = "Clean ",
+ [rxrpc_client_count] = "Count ",
+ [rxrpc_client_discard] = "Discar",
+ [rxrpc_client_duplicate] = "Duplic",
+ [rxrpc_client_exposed] = "Expose",
+ [rxrpc_client_replace] = "Replac",
+ [rxrpc_client_to_active] = "->Actv",
+ [rxrpc_client_to_culled] = "->Cull",
+ [rxrpc_client_to_idle] = "->Idle",
+ [rxrpc_client_to_inactive] = "->Inac",
+ [rxrpc_client_to_waiting] = "->Wait",
+ [rxrpc_client_uncount] = "Uncoun",
+};
+
+const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
+ [rxrpc_transmit_wait] = "WAI",
+ [rxrpc_transmit_queue] = "QUE",
+ [rxrpc_transmit_queue_last] = "QLS",
+ [rxrpc_transmit_rotate] = "ROT",
+ [rxrpc_transmit_rotate_last] = "RLS",
+ [rxrpc_transmit_await_reply] = "AWR",
+ [rxrpc_transmit_end] = "END",
+};
+
+const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
+ [rxrpc_receive_incoming] = "INC",
+ [rxrpc_receive_queue] = "QUE",
+ [rxrpc_receive_queue_last] = "QLS",
+ [rxrpc_receive_front] = "FRN",
+ [rxrpc_receive_rotate] = "ROT",
+ [rxrpc_receive_end] = "END",
+};
+
+const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
+ [rxrpc_recvmsg_enter] = "ENTR",
+ [rxrpc_recvmsg_wait] = "WAIT",
+ [rxrpc_recvmsg_dequeue] = "DEQU",
+ [rxrpc_recvmsg_hole] = "HOLE",
+ [rxrpc_recvmsg_next] = "NEXT",
+ [rxrpc_recvmsg_cont] = "CONT",
+ [rxrpc_recvmsg_full] = "FULL",
+ [rxrpc_recvmsg_data_return] = "DATA",
+ [rxrpc_recvmsg_terminal] = "TERM",
+ [rxrpc_recvmsg_to_be_accepted] = "TBAC",
+ [rxrpc_recvmsg_return] = "RETN",
+};
+
+const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
+ [rxrpc_rtt_tx_ping] = "PING",
+ [rxrpc_rtt_tx_data] = "DATA",
+};
+
+const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
+ [rxrpc_rtt_rx_ping_response] = "PONG",
+ [rxrpc_rtt_rx_requested_ack] = "RACK",
+};
+
+const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
+ [rxrpc_timer_begin] = "Begin ",
+ [rxrpc_timer_expired] = "*EXPR*",
+ [rxrpc_timer_init_for_reply] = "IniRpl",
+ [rxrpc_timer_init_for_send_reply] = "SndRpl",
+ [rxrpc_timer_set_for_ack] = "SetAck",
+ [rxrpc_timer_set_for_ping] = "SetPng",
+ [rxrpc_timer_set_for_send] = "SetTx ",
+ [rxrpc_timer_set_for_resend] = "SetRTx",
+};
+
+const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
+ [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
+ [rxrpc_propose_ack_input_data] = "DataIn ",
+ [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
+ [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
+ [rxrpc_propose_ack_ping_for_params] = "Params ",
+ [rxrpc_propose_ack_processing_op] = "ProcOp ",
+ [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
+ [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
+ [rxrpc_propose_ack_retry_tx] = "RetryTx",
+ [rxrpc_propose_ack_rotate_rx] = "RxAck ",
+ [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
+};
+
+const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
+ [rxrpc_propose_ack_use] = "",
+ [rxrpc_propose_ack_update] = " Update",
+ [rxrpc_propose_ack_subsume] = " Subsume",
+};
+
+const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
+ [RXRPC_CALL_SLOW_START] = "SlowStart",
+ [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
+ [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
+ [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
+};
+
+const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
+ [rxrpc_cong_begin_retransmission] = " Retrans",
+ [rxrpc_cong_cleared_nacks] = " Cleared",
+ [rxrpc_cong_new_low_nack] = " NewLowN",
+ [rxrpc_cong_no_change] = "",
+ [rxrpc_cong_progress] = " Progres",
+ [rxrpc_cong_retransmit_again] = " ReTxAgn",
+ [rxrpc_cong_rtt_window_end] = " RttWinE",
+ [rxrpc_cong_saw_nack] = " SawNack",
+};
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f4bda06b7d2d..5dab1ff3a6c2 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -14,336 +14,351 @@
#include <linux/net.h>
#include <linux/gfp.h>
#include <linux/skbuff.h>
-#include <linux/circ_buf.h>
#include <linux/export.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-/*
- * Time till packet resend (in jiffies).
- */
-unsigned int rxrpc_resend_timeout = 4 * HZ;
-
-static int rxrpc_send_data(struct rxrpc_sock *rx,
- struct rxrpc_call *call,
- struct msghdr *msg, size_t len);
-
-/*
- * extract control messages from the sendmsg() control buffer
- */
-static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
- unsigned long *user_call_ID,
- enum rxrpc_command *command,
- u32 *abort_code,
- bool *_exclusive)
-{
- struct cmsghdr *cmsg;
- bool got_user_ID = false;
- int len;
-
- *command = RXRPC_CMD_SEND_DATA;
-
- if (msg->msg_controllen == 0)
- return -EINVAL;
-
- for_each_cmsghdr(cmsg, msg) {
- if (!CMSG_OK(msg, cmsg))
- return -EINVAL;
-
- len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
- _debug("CMSG %d, %d, %d",
- cmsg->cmsg_level, cmsg->cmsg_type, len);
-
- if (cmsg->cmsg_level != SOL_RXRPC)
- continue;
-
- switch (cmsg->cmsg_type) {
- case RXRPC_USER_CALL_ID:
- if (msg->msg_flags & MSG_CMSG_COMPAT) {
- if (len != sizeof(u32))
- return -EINVAL;
- *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
- } else {
- if (len != sizeof(unsigned long))
- return -EINVAL;
- *user_call_ID = *(unsigned long *)
- CMSG_DATA(cmsg);
- }
- _debug("User Call ID %lx", *user_call_ID);
- got_user_ID = true;
- break;
-
- case RXRPC_ABORT:
- if (*command != RXRPC_CMD_SEND_DATA)
- return -EINVAL;
- *command = RXRPC_CMD_SEND_ABORT;
- if (len != sizeof(*abort_code))
- return -EINVAL;
- *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
- _debug("Abort %x", *abort_code);
- if (*abort_code == 0)
- return -EINVAL;
- break;
-
- case RXRPC_ACCEPT:
- if (*command != RXRPC_CMD_SEND_DATA)
- return -EINVAL;
- *command = RXRPC_CMD_ACCEPT;
- if (len != 0)
- return -EINVAL;
- break;
-
- case RXRPC_EXCLUSIVE_CALL:
- *_exclusive = true;
- if (len != 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
+struct rxrpc_ack_buffer {
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_ackpacket ack;
+ u8 acks[255];
+ u8 pad[3];
+ struct rxrpc_ackinfo ackinfo;
+};
- if (!got_user_ID)
- return -EINVAL;
- _leave(" = 0");
- return 0;
-}
+struct rxrpc_abort_buffer {
+ struct rxrpc_wire_header whdr;
+ __be32 abort_code;
+};
/*
- * abort a call, sending an ABORT packet to the peer
+ * Fill out an ACK packet.
*/
-static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
+static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
+ struct rxrpc_ack_buffer *pkt,
+ rxrpc_seq_t *_hard_ack,
+ rxrpc_seq_t *_top,
+ u8 reason)
{
- write_lock_bh(&call->state_lock);
-
- if (call->state <= RXRPC_CALL_COMPLETE) {
- call->state = RXRPC_CALL_LOCALLY_ABORTED;
- call->local_abort = abort_code;
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- rxrpc_queue_call(call);
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top, seq;
+ int ix;
+ u32 mtu, jmax;
+ u8 *ackp = pkt->acks;
+
+ /* Barrier against rxrpc_input_data(). */
+ serial = call->ackr_serial;
+ hard_ack = READ_ONCE(call->rx_hard_ack);
+ top = smp_load_acquire(&call->rx_top);
+ *_hard_ack = hard_ack;
+ *_top = top;
+
+ pkt->ack.bufferSpace = htons(8);
+ pkt->ack.maxSkew = htons(call->ackr_skew);
+ pkt->ack.firstPacket = htonl(hard_ack + 1);
+ pkt->ack.previousPacket = htonl(call->ackr_prev_seq);
+ pkt->ack.serial = htonl(serial);
+ pkt->ack.reason = reason;
+ pkt->ack.nAcks = top - hard_ack;
+
+ if (reason == RXRPC_ACK_PING)
+ pkt->whdr.flags |= RXRPC_REQUEST_ACK;
+
+ if (after(top, hard_ack)) {
+ seq = hard_ack + 1;
+ do {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ if (call->rxtx_buffer[ix])
+ *ackp++ = RXRPC_ACK_TYPE_ACK;
+ else
+ *ackp++ = RXRPC_ACK_TYPE_NACK;
+ seq++;
+ } while (before_eq(seq, top));
}
- write_unlock_bh(&call->state_lock);
-}
-
-/*
- * Create a new client call for sendmsg().
- */
-static struct rxrpc_call *
-rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
- unsigned long user_call_ID, bool exclusive)
-{
- struct rxrpc_conn_parameters cp;
- struct rxrpc_call *call;
- struct key *key;
-
- DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
-
- _enter("");
-
- if (!msg->msg_name)
- return ERR_PTR(-EDESTADDRREQ);
-
- key = rx->key;
- if (key && !rx->key->payload.data[0])
- key = NULL;
-
- memset(&cp, 0, sizeof(cp));
- cp.local = rx->local;
- cp.key = rx->key;
- cp.security_level = rx->min_sec_level;
- cp.exclusive = rx->exclusive | exclusive;
- cp.service_id = srx->srx_service;
- call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
-
- _leave(" = %p\n", call);
- return call;
+ mtu = call->conn->params.peer->if_mtu;
+ mtu -= call->conn->params.peer->hdrsize;
+ jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max;
+ pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
+ pkt->ackinfo.maxMTU = htonl(mtu);
+ pkt->ackinfo.rwind = htonl(call->rx_winsize);
+ pkt->ackinfo.jumbo_max = htonl(jmax);
+
+ *ackp++ = 0;
+ *ackp++ = 0;
+ *ackp++ = 0;
+ return top - hard_ack + 3;
}
/*
- * send a message forming part of a client call through an RxRPC socket
- * - caller holds the socket locked
- * - the socket may be either a client socket or a server socket
+ * Send an ACK call packet.
*/
-int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
{
- enum rxrpc_command cmd;
- struct rxrpc_call *call;
- unsigned long user_call_ID = 0;
- bool exclusive = false;
- u32 abort_code = 0;
+ struct rxrpc_connection *conn = NULL;
+ struct rxrpc_ack_buffer *pkt;
+ struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top;
+ size_t len, n;
int ret;
-
- _enter("");
-
- ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
- &exclusive);
- if (ret < 0)
- return ret;
-
- if (cmd == RXRPC_CMD_ACCEPT) {
- if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
- return -EINVAL;
- call = rxrpc_accept_call(rx, user_call_ID);
- if (IS_ERR(call))
- return PTR_ERR(call);
- rxrpc_put_call(call);
- return 0;
+ u8 reason;
+
+ spin_lock_bh(&call->lock);
+ if (call->conn)
+ conn = rxrpc_get_connection_maybe(call->conn);
+ spin_unlock_bh(&call->lock);
+ if (!conn)
+ return -ECONNRESET;
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt) {
+ rxrpc_put_connection(conn);
+ return -ENOMEM;
}
- call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
- if (!call) {
- if (cmd != RXRPC_CMD_SEND_DATA)
- return -EBADSLT;
- call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
- exclusive);
- if (IS_ERR(call))
- return PTR_ERR(call);
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ pkt->whdr.epoch = htonl(conn->proto.epoch);
+ pkt->whdr.cid = htonl(call->cid);
+ pkt->whdr.callNumber = htonl(call->call_id);
+ pkt->whdr.seq = 0;
+ pkt->whdr.type = RXRPC_PACKET_TYPE_ACK;
+ pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag;
+ pkt->whdr.userStatus = 0;
+ pkt->whdr.securityIndex = call->security_ix;
+ pkt->whdr._rsvd = 0;
+ pkt->whdr.serviceId = htons(call->service_id);
+
+ spin_lock_bh(&call->lock);
+ if (ping) {
+ reason = RXRPC_ACK_PING;
+ } else {
+ reason = call->ackr_reason;
+ if (!call->ackr_reason) {
+ spin_unlock_bh(&call->lock);
+ ret = 0;
+ goto out;
+ }
+ call->ackr_reason = 0;
+ }
+ n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason);
+
+ spin_unlock_bh(&call->lock);
+
+ iov[0].iov_base = pkt;
+ iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n;
+ iov[1].iov_base = &pkt->ackinfo;
+ iov[1].iov_len = sizeof(pkt->ackinfo);
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ serial = atomic_inc_return(&conn->serial);
+ pkt->whdr.serial = htonl(serial);
+ trace_rxrpc_tx_ack(call, serial,
+ ntohl(pkt->ack.firstPacket),
+ ntohl(pkt->ack.serial),
+ pkt->ack.reason, pkt->ack.nAcks);
+
+ if (ping) {
+ call->ping_serial = serial;
+ smp_wmb();
+ /* We need to stick a time in before we send the packet in case
+ * the reply gets back before kernel_sendmsg() completes - but
+ * asking UDP to send the packet can take a relatively long
+ * time, so we update the time after, on the assumption that
+ * the packet transmission is more likely to happen towards the
+ * end of the kernel_sendmsg() call.
+ */
+ call->ping_time = ktime_get_real();
+ set_bit(RXRPC_CALL_PINGING, &call->flags);
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
}
- _debug("CALL %d USR %lx ST %d on CONN %p",
- call->debug_id, call->user_call_ID, call->state, call->conn);
-
- if (call->state >= RXRPC_CALL_COMPLETE) {
- /* it's too late for this call */
- ret = -ECONNRESET;
- } else if (cmd == RXRPC_CMD_SEND_ABORT) {
- rxrpc_send_abort(call, abort_code);
- ret = 0;
- } else if (cmd != RXRPC_CMD_SEND_DATA) {
- ret = -EINVAL;
- } else if (!call->in_clientflag &&
- call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
- /* request phase complete for this client call */
- ret = -EPROTO;
- } else if (call->in_clientflag &&
- call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
- /* Reply phase not begun or not complete for service call. */
- ret = -EPROTO;
- } else {
- ret = rxrpc_send_data(rx, call, msg, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+ if (ping)
+ call->ping_time = ktime_get_real();
+
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ if (ret < 0) {
+ if (ping)
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
+ rxrpc_propose_ACK(call, pkt->ack.reason,
+ ntohs(pkt->ack.maxSkew),
+ ntohl(pkt->ack.serial),
+ true, true,
+ rxrpc_propose_ack_retry_tx);
+ } else {
+ spin_lock_bh(&call->lock);
+ if (after(hard_ack, call->ackr_consumed))
+ call->ackr_consumed = hard_ack;
+ if (after(top, call->ackr_seen))
+ call->ackr_seen = top;
+ spin_unlock_bh(&call->lock);
+ }
}
- rxrpc_put_call(call);
- _leave(" = %d", ret);
+out:
+ rxrpc_put_connection(conn);
+ kfree(pkt);
return ret;
}
-/**
- * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
- * @call: The call to send data through
- * @msg: The data to send
- * @len: The amount of data to send
- *
- * Allow a kernel service to send data on a call. The call must be in an state
- * appropriate to sending data. No control data should be supplied in @msg,
- * nor should an address be supplied. MSG_MORE should be flagged if there's
- * more data to come, otherwise this data will end the transmission phase.
+/*
+ * Send an ABORT call packet.
*/
-int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
- size_t len)
+int rxrpc_send_abort_packet(struct rxrpc_call *call)
{
+ struct rxrpc_connection *conn = NULL;
+ struct rxrpc_abort_buffer pkt;
+ struct msghdr msg;
+ struct kvec iov[1];
+ rxrpc_serial_t serial;
int ret;
- _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
-
- ASSERTCMP(msg->msg_name, ==, NULL);
- ASSERTCMP(msg->msg_control, ==, NULL);
-
- lock_sock(&call->socket->sk);
-
- _debug("CALL %d USR %lx ST %d on CONN %p",
- call->debug_id, call->user_call_ID, call->state, call->conn);
+ spin_lock_bh(&call->lock);
+ if (call->conn)
+ conn = rxrpc_get_connection_maybe(call->conn);
+ spin_unlock_bh(&call->lock);
+ if (!conn)
+ return -ECONNRESET;
- if (call->state >= RXRPC_CALL_COMPLETE) {
- ret = -ESHUTDOWN; /* it's too late for this call */
- } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
- call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
- ret = -EPROTO; /* request phase complete for this client call */
- } else {
- ret = rxrpc_send_data(call->socket, call, msg, len);
- }
-
- release_sock(&call->socket->sk);
- _leave(" = %d", ret);
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ pkt.whdr.epoch = htonl(conn->proto.epoch);
+ pkt.whdr.cid = htonl(call->cid);
+ pkt.whdr.callNumber = htonl(call->call_id);
+ pkt.whdr.seq = 0;
+ pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT;
+ pkt.whdr.flags = conn->out_clientflag;
+ pkt.whdr.userStatus = 0;
+ pkt.whdr.securityIndex = call->security_ix;
+ pkt.whdr._rsvd = 0;
+ pkt.whdr.serviceId = htons(call->service_id);
+ pkt.abort_code = htonl(call->abort_code);
+
+ iov[0].iov_base = &pkt;
+ iov[0].iov_len = sizeof(pkt);
+
+ serial = atomic_inc_return(&conn->serial);
+ pkt.whdr.serial = htonl(serial);
+
+ ret = kernel_sendmsg(conn->params.local->socket,
+ &msg, iov, 1, sizeof(pkt));
+
+ rxrpc_put_connection(conn);
return ret;
}
-EXPORT_SYMBOL(rxrpc_kernel_send_data);
-
-/**
- * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
- * @call: The call to be aborted
- * @abort_code: The abort code to stick into the ABORT packet
- *
- * Allow a kernel service to abort a call, if it's still in an abortable state.
- */
-void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code)
-{
- _enter("{%d},%d", call->debug_id, abort_code);
-
- lock_sock(&call->socket->sk);
-
- _debug("CALL %d USR %lx ST %d on CONN %p",
- call->debug_id, call->user_call_ID, call->state, call->conn);
-
- if (call->state < RXRPC_CALL_COMPLETE)
- rxrpc_send_abort(call, abort_code);
-
- release_sock(&call->socket->sk);
- _leave("");
-}
-
-EXPORT_SYMBOL(rxrpc_kernel_abort_call);
-
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ bool retrans)
{
- struct kvec iov[1];
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ size_t len;
+ bool lost = false;
int ret, opt;
_enter(",{%d}", skb->len);
- iov[0].iov_base = skb->head;
- iov[0].iov_len = skb->len;
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = atomic_inc_return(&conn->serial);
+
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(serial);
+ whdr.type = RXRPC_PACKET_TYPE_DATA;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->security_ix;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(call->service_id);
- msg.msg_name = &conn->params.peer->srx.transport;
- msg.msg_namelen = conn->params.peer->srx.transport_len;
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = skb->head;
+ iov[1].iov_len = skb->len;
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
- down_read(&conn->params.local->defrag_sem);
- /* send the packet by UDP
- * - returns -EMSGSIZE if UDP would have to fragment the packet
- * to go out of the interface
- * - in which case, we'll have processed the ICMP error
- * message and update the peer record
- */
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ /* If our RTT cache needs working on, request an ACK. Also request
+ * ACKs if a DATA packet appears to have been lost.
+ */
+ if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
+ (retrans ||
+ call->cong_mode == RXRPC_CALL_SLOW_START ||
+ (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ ktime_get_real())))
+ whdr.flags |= RXRPC_REQUEST_ACK;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ lost = true;
+ goto done;
+ }
+ }
- up_read(&conn->params.local->defrag_sem);
- if (ret == -EMSGSIZE)
- goto send_fragmentable;
+ _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
- _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
- return ret;
+ /* send the packet with the don't fragment bit set if we currently
+ * think it's small enough */
+ if (iov[1].iov_len >= call->peer->maxdata)
+ goto send_fragmentable;
+
+ down_read(&conn->params.local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+ */
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+
+ up_read(&conn->params.local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
+
+done:
+ trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
+ retrans, lost);
+ if (ret >= 0) {
+ ktime_t now = ktime_get_real();
+ skb->tstamp = now;
+ smp_wmb();
+ sp->hdr.serial = serial;
+ if (whdr.flags & RXRPC_REQUEST_ACK) {
+ call->peer->rtt_last_req = now;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
+ }
}
+ _leave(" = %d [%u]", ret, call->peer->maxdata);
+ return ret;
send_fragmentable:
/* attempt to send this message with fragmentation enabled */
@@ -358,8 +373,8 @@ send_fragmentable:
SOL_IP, IP_MTU_DISCOVER,
(char *)&opt, sizeof(opt));
if (ret == 0) {
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 2, len);
opt = IP_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -367,355 +382,82 @@ send_fragmentable:
(char *)&opt, sizeof(opt));
}
break;
- }
-
- up_write(&conn->params.local->defrag_sem);
- _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
- return ret;
-}
-/*
- * wait for space to appear in the transmit/ACK window
- * - caller holds the socket locked
- */
-static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
- struct rxrpc_call *call,
- long *timeo)
-{
- DECLARE_WAITQUEUE(myself, current);
- int ret;
-
- _enter(",{%d},%ld",
- CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
- call->acks_winsz),
- *timeo);
-
- add_wait_queue(&call->tx_waitq, &myself);
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- ret = 0;
- if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) > 0)
- break;
- if (signal_pending(current)) {
- ret = sock_intr_errno(*timeo);
- break;
- }
-
- release_sock(&rx->sk);
- *timeo = schedule_timeout(*timeo);
- lock_sock(&rx->sk);
- }
-
- remove_wait_queue(&call->tx_waitq, &myself);
- set_current_state(TASK_RUNNING);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * attempt to schedule an instant Tx resend
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call)
-{
- read_lock_bh(&call->state_lock);
- if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
- rxrpc_queue_call(call);
- }
- read_unlock_bh(&call->state_lock);
-}
-
-/*
- * queue a packet for transmission, set the resend timer and attempt
- * to send the packet immediately
- */
-static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
- bool last)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- int ret;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ opt = IPV6_PMTUDISC_DONT;
+ ret = kernel_setsockopt(conn->params.local->socket,
+ SOL_IPV6, IPV6_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ if (ret == 0) {
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 1, iov[0].iov_len);
- _net("queue skb %p [%d]", skb, call->acks_head);
-
- ASSERT(call->acks_window != NULL);
- call->acks_window[call->acks_head] = (unsigned long) skb;
- smp_wmb();
- call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
-
- if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
- _debug("________awaiting reply/ACK__________");
- write_lock_bh(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_SEND_REQUEST:
- call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
- break;
- case RXRPC_CALL_SERVER_ACK_REQUEST:
- call->state = RXRPC_CALL_SERVER_SEND_REPLY;
- if (!last)
- break;
- case RXRPC_CALL_SERVER_SEND_REPLY:
- call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
- break;
- default:
- break;
+ opt = IPV6_PMTUDISC_DO;
+ kernel_setsockopt(conn->params.local->socket,
+ SOL_IPV6, IPV6_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
}
- write_unlock_bh(&call->state_lock);
- }
-
- _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
-
- sp->need_resend = false;
- sp->resend_at = jiffies + rxrpc_resend_timeout;
- if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
- _debug("run timer");
- call->resend_timer.expires = sp->resend_at;
- add_timer(&call->resend_timer);
- }
-
- /* attempt to cancel the rx-ACK timer, deferring reply transmission if
- * we're ACK'ing the request phase of an incoming call */
- ret = -EAGAIN;
- if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
- /* the packet may be freed by rxrpc_process_call() before this
- * returns */
- ret = rxrpc_send_data_packet(call->conn, skb);
- _net("sent skb %p", skb);
- } else {
- _debug("failed to delete ACK timer");
- }
-
- if (ret < 0) {
- _debug("need instant resend %d", ret);
- sp->need_resend = true;
- rxrpc_instant_resend(call);
+ break;
+#endif
}
- _leave("");
-}
-
-/*
- * Convert a host-endian header into a network-endian header.
- */
-static void rxrpc_insert_header(struct sk_buff *skb)
-{
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = htonl(sp->hdr.serial);
- whdr.type = sp->hdr.type;
- whdr.flags = sp->hdr.flags;
- whdr.userStatus = sp->hdr.userStatus;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = htons(sp->hdr._rsvd);
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- memcpy(skb->head, &whdr, sizeof(whdr));
+ up_write(&conn->params.local->defrag_sem);
+ goto done;
}
/*
- * send data through a socket
- * - must be called in process context
- * - caller holds the socket locked
+ * reject packets through the local endpoint
*/
-static int rxrpc_send_data(struct rxrpc_sock *rx,
- struct rxrpc_call *call,
- struct msghdr *msg, size_t len)
+void rxrpc_reject_packets(struct rxrpc_local *local)
{
+ struct sockaddr_rxrpc srx;
struct rxrpc_skb_priv *sp;
+ struct rxrpc_wire_header whdr;
struct sk_buff *skb;
- struct sock *sk = &rx->sk;
- long timeo;
- bool more;
- int ret, copied;
-
- timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
- /* this should be in poll */
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- return -EPIPE;
-
- more = msg->msg_flags & MSG_MORE;
-
- skb = call->tx_pending;
- call->tx_pending = NULL;
-
- copied = 0;
- do {
- if (!skb) {
- size_t size, chunk, max, space;
-
- _debug("alloc");
-
- if (CIRC_SPACE(call->acks_head,
- ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) <= 0) {
- ret = -EAGAIN;
- if (msg->msg_flags & MSG_DONTWAIT)
- goto maybe_error;
- ret = rxrpc_wait_for_tx_window(rx, call,
- &timeo);
- if (ret < 0)
- goto maybe_error;
- }
-
- max = call->conn->params.peer->maxdata;
- max -= call->conn->security_size;
- max &= ~(call->conn->size_align - 1UL);
-
- chunk = max;
- if (chunk > msg_data_left(msg) && !more)
- chunk = msg_data_left(msg);
-
- space = chunk + call->conn->size_align;
- space &= ~(call->conn->size_align - 1UL);
-
- size = space + call->conn->header_size;
-
- _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
-
- /* create a buffer that we can retain until it's ACK'd */
- skb = sock_alloc_send_skb(
- sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
- if (!skb)
- goto maybe_error;
-
- rxrpc_new_skb(skb);
-
- _debug("ALLOC SEND %p", skb);
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t size;
+ __be32 code;
- ASSERTCMP(skb->mark, ==, 0);
+ _enter("%d", local->debug_id);
- _debug("HS: %u", call->conn->header_size);
- skb_reserve(skb, call->conn->header_size);
- skb->len += call->conn->header_size;
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = &code;
+ iov[1].iov_len = sizeof(code);
+ size = sizeof(whdr) + sizeof(code);
- sp = rxrpc_skb(skb);
- sp->remain = chunk;
- if (sp->remain > skb_tailroom(skb))
- sp->remain = skb_tailroom(skb);
+ msg.msg_name = &srx.transport;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
- _net("skb: hr %d, tr %d, hl %d, rm %d",
- skb_headroom(skb),
- skb_tailroom(skb),
- skb_headlen(skb),
- sp->remain);
+ memset(&whdr, 0, sizeof(whdr));
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
-
- _debug("append");
+ while ((skb = skb_dequeue(&local->reject_queue))) {
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
sp = rxrpc_skb(skb);
- /* append next segment of data to the current buffer */
- if (msg_data_left(msg) > 0) {
- int copy = skb_tailroom(skb);
- ASSERTCMP(copy, >, 0);
- if (copy > msg_data_left(msg))
- copy = msg_data_left(msg);
- if (copy > sp->remain)
- copy = sp->remain;
-
- _debug("add");
- ret = skb_add_data(skb, &msg->msg_iter, copy);
- _debug("added");
- if (ret < 0)
- goto efault;
- sp->remain -= copy;
- skb->mark += copy;
- copied += copy;
- }
+ if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
+ msg.msg_namelen = srx.transport_len;
- /* check for the far side aborting the call or a network error
- * occurring */
- if (call->state > RXRPC_CALL_COMPLETE)
- goto call_aborted;
-
- /* add the packet to the send queue if it's now full */
- if (sp->remain <= 0 ||
- (msg_data_left(msg) == 0 && !more)) {
- struct rxrpc_connection *conn = call->conn;
- uint32_t seq;
- size_t pad;
-
- /* pad out if we're using security */
- if (conn->security_ix) {
- pad = conn->security_size + skb->mark;
- pad = conn->size_align - pad;
- pad &= conn->size_align - 1;
- _debug("pad %zu", pad);
- if (pad)
- memset(skb_put(skb, pad), 0, pad);
- }
-
- seq = atomic_inc_return(&call->sequence);
-
- sp->hdr.epoch = conn->proto.epoch;
- sp->hdr.cid = call->cid;
- sp->hdr.callNumber = call->call_id;
- sp->hdr.seq = seq;
- sp->hdr.serial = atomic_inc_return(&conn->serial);
- sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
- sp->hdr.userStatus = 0;
- sp->hdr.securityIndex = conn->security_ix;
- sp->hdr._rsvd = 0;
- sp->hdr.serviceId = call->service_id;
-
- sp->hdr.flags = conn->out_clientflag;
- if (msg_data_left(msg) == 0 && !more)
- sp->hdr.flags |= RXRPC_LAST_PACKET;
- else if (CIRC_SPACE(call->acks_head,
- ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) > 1)
- sp->hdr.flags |= RXRPC_MORE_PACKETS;
- if (more && seq & 1)
- sp->hdr.flags |= RXRPC_REQUEST_ACK;
-
- ret = conn->security->secure_packet(
- call, skb, skb->mark,
- skb->head + sizeof(struct rxrpc_wire_header));
- if (ret < 0)
- goto out;
-
- rxrpc_insert_header(skb);
- rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
- skb = NULL;
- }
- } while (msg_data_left(msg) > 0);
+ code = htonl(skb->priority);
-success:
- ret = copied;
-out:
- call->tx_pending = skb;
- _leave(" = %d", ret);
- return ret;
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+ whdr.flags = sp->hdr.flags;
+ whdr.flags ^= RXRPC_CLIENT_INITIATED;
+ whdr.flags &= RXRPC_CLIENT_INITIATED;
-call_aborted:
- rxrpc_free_skb(skb);
- if (call->state == RXRPC_CALL_NETWORK_ERROR)
- ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ?
- call->error_report :
- call->error_report - RXRPC_LOCAL_ERROR_OFFSET;
- else
- ret = -ECONNABORTED;
- _leave(" = %d", ret);
- return ret;
+ kernel_sendmsg(local->socket, &msg, iov, 2, size);
+ }
-maybe_error:
- if (copied)
- goto success;
- goto out;
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ }
-efault:
- ret = -EFAULT;
- goto out;
+ _leave("");
}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 8940674b5e08..bf13b8470c9a 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -66,6 +66,32 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
}
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ srx.transport.sin6.sin6_port = serr->port;
+ srx.transport_len = sizeof(struct sockaddr_in6);
+ switch (serr->ee.ee_origin) {
+ case SO_EE_ORIGIN_ICMP6:
+ _net("Rx ICMP6");
+ memcpy(&srx.transport.sin6.sin6_addr,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in6_addr));
+ break;
+ case SO_EE_ORIGIN_ICMP:
+ _net("Rx ICMP on v6 sock");
+ memcpy(srx.transport.sin6.sin6_addr.s6_addr + 12,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in_addr));
+ break;
+ default:
+ memcpy(&srx.transport.sin6.sin6_addr,
+ &ipv6_hdr(skb)->saddr,
+ sizeof(struct in6_addr));
+ break;
+ }
+ break;
+#endif
+
default:
BUG();
}
@@ -129,22 +155,21 @@ void rxrpc_error_report(struct sock *sk)
_leave("UDP socket errqueue empty");
return;
}
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
serr = SKB_EXT_ERR(skb);
if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
_leave("UDP empty message");
- kfree_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
return;
}
- rxrpc_new_skb(skb);
-
rcu_read_lock();
peer = rxrpc_lookup_peer_icmp_rcu(local, skb);
if (peer && !rxrpc_get_peer_maybe(peer))
peer = NULL;
if (!peer) {
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [no peer]");
return;
}
@@ -154,7 +179,7 @@ void rxrpc_error_report(struct sock *sk)
serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
rxrpc_adjust_mtu(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
rxrpc_put_peer(peer);
_leave(" [MTU update]");
return;
@@ -162,7 +187,7 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_store_error(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
/* The ref we obtained is passed off to the work item */
rxrpc_queue_work(&peer->error_distributor);
@@ -248,13 +273,20 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
struct rxrpc_peer *peer =
container_of(work, struct rxrpc_peer, error_distributor);
struct rxrpc_call *call;
- int error_report;
+ enum rxrpc_call_completion compl;
+ int error;
_enter("");
- error_report = READ_ONCE(peer->error_report);
+ error = READ_ONCE(peer->error_report);
+ if (error < RXRPC_LOCAL_ERROR_OFFSET) {
+ compl = RXRPC_CALL_NETWORK_ERROR;
+ } else {
+ compl = RXRPC_CALL_LOCAL_ERROR;
+ error -= RXRPC_LOCAL_ERROR_OFFSET;
+ }
- _debug("ISSUE ERROR %d", error_report);
+ _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error);
spin_lock_bh(&peer->lock);
@@ -262,16 +294,10 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
call = hlist_entry(peer->error_targets.first,
struct rxrpc_call, error_link);
hlist_del_init(&call->error_link);
+ rxrpc_see_call(call);
- write_lock(&call->state_lock);
- if (call->state != RXRPC_CALL_COMPLETE &&
- call->state < RXRPC_CALL_NETWORK_ERROR) {
- call->error_report = error_report;
- call->state = RXRPC_CALL_NETWORK_ERROR;
- set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
- rxrpc_queue_call(call);
- }
- write_unlock(&call->state_lock);
+ if (rxrpc_set_call_completion(call, compl, 0, error))
+ rxrpc_notify_socket(call);
}
spin_unlock_bh(&peer->lock);
@@ -279,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
rxrpc_put_peer(peer);
_leave("");
}
+
+/*
+ * Add RTT information to cache. This is called in softirq mode and has
+ * exclusive access to the peer RTT data.
+ */
+void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time)
+{
+ struct rxrpc_peer *peer = call->peer;
+ s64 rtt;
+ u64 sum = peer->rtt_sum, avg;
+ u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage;
+
+ rtt = ktime_to_ns(ktime_sub(resp_time, send_time));
+ if (rtt < 0)
+ return;
+
+ /* Replace the oldest datum in the RTT buffer */
+ sum -= peer->rtt_cache[cursor];
+ sum += rtt;
+ peer->rtt_cache[cursor] = rtt;
+ peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
+ peer->rtt_sum = sum;
+ if (usage < RXRPC_RTT_CACHE_SIZE) {
+ usage++;
+ peer->rtt_usage = usage;
+ }
+
+ /* Now recalculate the average */
+ if (usage == RXRPC_RTT_CACHE_SIZE) {
+ avg = sum / RXRPC_RTT_CACHE_SIZE;
+ } else {
+ avg = sum;
+ do_div(avg, usage);
+ }
+
+ peer->rtt = avg;
+ trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
+ usage, avg);
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 538e9831c699..862eea6b266c 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -16,12 +16,14 @@
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/ip6_route.h>
#include "ar-internal.h"
static DEFINE_HASHTABLE(rxrpc_peer_hash, 10);
@@ -50,6 +52,13 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local,
size = sizeof(srx->transport.sin.sin_addr);
p = (u16 *)&srx->transport.sin.sin_addr;
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ hash_key += (u16 __force)srx->transport.sin.sin_port;
+ size = sizeof(srx->transport.sin6.sin6_addr);
+ p = (u16 *)&srx->transport.sin6.sin6_addr;
+ break;
+#endif
default:
WARN(1, "AF_RXRPC: Unsupported transport address family\n");
return 0;
@@ -93,6 +102,14 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer,
memcmp(&peer->srx.transport.sin.sin_addr,
&srx->transport.sin.sin_addr,
sizeof(struct in_addr));
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ return ((u16 __force)peer->srx.transport.sin6.sin6_port -
+ (u16 __force)srx->transport.sin6.sin6_port) ?:
+ memcmp(&peer->srx.transport.sin6.sin6_addr,
+ &srx->transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+#endif
default:
BUG();
}
@@ -130,17 +147,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
if (peer) {
- switch (srx->transport.family) {
- case AF_INET:
- _net("PEER %d {%d,%u,%pI4+%hu}",
- peer->debug_id,
- peer->srx.transport_type,
- peer->srx.transport.family,
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
- break;
- }
-
+ _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
_leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
}
return peer;
@@ -152,22 +159,53 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
*/
static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
{
+ struct dst_entry *dst;
struct rtable *rt;
- struct flowi4 fl4;
+ struct flowi fl;
+ struct flowi4 *fl4 = &fl.u.ip4;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ struct flowi6 *fl6 = &fl.u.ip6;
+#endif
peer->if_mtu = 1500;
- rt = ip_route_output_ports(&init_net, &fl4, NULL,
- peer->srx.transport.sin.sin_addr.s_addr, 0,
- htons(7000), htons(7001),
- IPPROTO_UDP, 0, 0);
- if (IS_ERR(rt)) {
- _leave(" [route err %ld]", PTR_ERR(rt));
- return;
+ memset(&fl, 0, sizeof(fl));
+ switch (peer->srx.transport.family) {
+ case AF_INET:
+ rt = ip_route_output_ports(
+ &init_net, fl4, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001), IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
+ return;
+ }
+ dst = &rt->dst;
+ break;
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ fl6->flowi6_scope = RT_SCOPE_UNIVERSE;
+ fl6->flowi6_proto = IPPROTO_UDP;
+ memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+ fl6->fl6_dport = htons(7001);
+ fl6->fl6_sport = htons(7000);
+ dst = ip6_route_output(&init_net, NULL, fl6);
+ if (dst->error) {
+ _leave(" [route err %d]", dst->error);
+ return;
+ }
+ break;
+#endif
+
+ default:
+ BUG();
}
- peer->if_mtu = dst_mtu(&rt->dst);
- dst_release(&rt->dst);
+ peer->if_mtu = dst_mtu(dst);
+ dst_release(dst);
_leave(" [if_mtu %u]", peer->if_mtu);
}
@@ -199,6 +237,41 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
}
/*
+ * Initialise peer record.
+ */
+static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
+{
+ peer->hash_key = hash_key;
+ rxrpc_assess_MTU_size(peer);
+ peer->mtu = peer->if_mtu;
+ peer->rtt_last_req = ktime_get_real();
+
+ switch (peer->srx.transport.family) {
+ case AF_INET:
+ peer->hdrsize = sizeof(struct iphdr);
+ break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ peer->hdrsize = sizeof(struct ipv6hdr);
+ break;
+#endif
+ default:
+ BUG();
+ }
+
+ switch (peer->srx.transport_type) {
+ case SOCK_DGRAM:
+ peer->hdrsize += sizeof(struct udphdr);
+ break;
+ default:
+ BUG();
+ }
+
+ peer->hdrsize += sizeof(struct rxrpc_wire_header);
+ peer->maxdata = peer->mtu - peer->hdrsize;
+}
+
+/*
* Set up a new peer.
*/
static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
@@ -212,31 +285,40 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
peer = rxrpc_alloc_peer(local, gfp);
if (peer) {
- peer->hash_key = hash_key;
memcpy(&peer->srx, srx, sizeof(*srx));
+ rxrpc_init_peer(peer, hash_key);
+ }
- rxrpc_assess_MTU_size(peer);
- peer->mtu = peer->if_mtu;
-
- if (srx->transport.family == AF_INET) {
- peer->hdrsize = sizeof(struct iphdr);
- switch (srx->transport_type) {
- case SOCK_DGRAM:
- peer->hdrsize += sizeof(struct udphdr);
- break;
- default:
- BUG();
- break;
- }
- } else {
- BUG();
- }
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * Set up a new incoming peer. The address is prestored in the preallocated
+ * peer.
+ */
+struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
+ struct rxrpc_peer *prealloc)
+{
+ struct rxrpc_peer *peer;
+ unsigned long hash_key;
+
+ hash_key = rxrpc_peer_hash_key(local, &prealloc->srx);
+ prealloc->local = local;
+ rxrpc_init_peer(prealloc, hash_key);
- peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
+ spin_lock(&rxrpc_peer_hash_lock);
+
+ /* Need to check that we aren't racing with someone else */
+ peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ peer = prealloc;
+ hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key);
}
- _leave(" = %p", peer);
+ spin_unlock(&rxrpc_peer_hash_lock);
return peer;
}
@@ -249,11 +331,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
struct rxrpc_peer *peer, *candidate;
unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
- _enter("{%d,%d,%pI4+%hu}",
- srx->transport_type,
- srx->transport_len,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
+ _enter("{%pISp}", &srx->transport);
/* search the peer list first */
rcu_read_lock();
@@ -272,7 +350,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
return NULL;
}
- spin_lock(&rxrpc_peer_hash_lock);
+ spin_lock_bh(&rxrpc_peer_hash_lock);
/* Need to check that we aren't racing with someone else */
peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
@@ -282,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
hash_add_rcu(rxrpc_peer_hash,
&candidate->hash_link, hash_key);
- spin_unlock(&rxrpc_peer_hash_lock);
+ spin_unlock_bh(&rxrpc_peer_hash_lock);
if (peer)
kfree(candidate);
@@ -290,11 +368,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
peer = candidate;
}
- _net("PEER %d {%d,%pI4+%hu}",
- peer->debug_id,
- peer->srx.transport_type,
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
+ _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
_leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
return peer;
@@ -307,9 +381,24 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer)
{
ASSERT(hlist_empty(&peer->error_targets));
- spin_lock(&rxrpc_peer_hash_lock);
+ spin_lock_bh(&rxrpc_peer_hash_lock);
hash_del_rcu(&peer->hash_link);
- spin_unlock(&rxrpc_peer_hash_lock);
+ spin_unlock_bh(&rxrpc_peer_hash_lock);
kfree_rcu(peer, rcu);
}
+
+/**
+ * rxrpc_kernel_get_peer - Get the peer address of a call
+ * @sock: The socket on which the call is in progress.
+ * @call: The call to query
+ * @_srx: Where to place the result
+ *
+ * Get the address of the remote peer in a call.
+ */
+void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
+ struct sockaddr_rxrpc *_srx)
+{
+ *_srx = call->peer->srx;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_peer);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index ced5f07444e5..65cd980767fa 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -17,12 +17,12 @@
static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
[RXRPC_CONN_UNUSED] = "Unused ",
[RXRPC_CONN_CLIENT] = "Client ",
+ [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc",
[RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ",
[RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ",
[RXRPC_CONN_SERVICE] = "SvSecure",
[RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort",
[RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort",
- [RXRPC_CONN_NETWORK_ERROR] = "NetError",
};
/*
@@ -30,6 +30,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
*/
static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
{
+ rcu_read_lock();
read_lock(&rxrpc_call_lock);
return seq_list_start_head(&rxrpc_calls, *_pos);
}
@@ -42,17 +43,21 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
{
read_unlock(&rxrpc_call_lock);
+ rcu_read_unlock();
}
static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
{
- struct rxrpc_connection *conn;
+ struct rxrpc_local *local;
+ struct rxrpc_sock *rx;
+ struct rxrpc_peer *peer;
struct rxrpc_call *call;
- char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+ char lbuff[50], rbuff[50];
if (v == &rxrpc_calls) {
seq_puts(seq,
- "Proto Local Remote "
+ "Proto Local "
+ " Remote "
" SvID ConnID CallID End Use State Abort "
" UserID\n");
return 0;
@@ -60,30 +65,35 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
call = list_entry(v, struct rxrpc_call, link);
- sprintf(lbuff, "%pI4:%u",
- &call->local->srx.transport.sin.sin_addr,
- ntohs(call->local->srx.transport.sin.sin_port));
+ rx = rcu_dereference(call->socket);
+ if (rx) {
+ local = READ_ONCE(rx->local);
+ if (local)
+ sprintf(lbuff, "%pISpc", &local->srx.transport);
+ else
+ strcpy(lbuff, "no_local");
+ } else {
+ strcpy(lbuff, "no_socket");
+ }
- conn = call->conn;
- if (conn)
- sprintf(rbuff, "%pI4:%u",
- &conn->params.peer->srx.transport.sin.sin_addr,
- ntohs(conn->params.peer->srx.transport.sin.sin_port));
+ peer = call->peer;
+ if (peer)
+ sprintf(rbuff, "%pISpc", &peer->srx.transport);
else
strcpy(rbuff, "no_connection");
seq_printf(seq,
- "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
+ "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %lx\n",
lbuff,
rbuff,
call->service_id,
call->cid,
call->call_id,
- call->in_clientflag ? "Svc" : "Clt",
+ rxrpc_is_service_call(call) ? "Svc" : "Clt",
atomic_read(&call->usage),
rxrpc_call_states[call->state],
- call->remote_abort ?: call->local_abort,
+ call->abort_code,
call->user_call_ID);
return 0;
@@ -115,13 +125,13 @@ const struct file_operations rxrpc_call_seq_fops = {
static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
{
read_lock(&rxrpc_connection_lock);
- return seq_list_start_head(&rxrpc_connections, *_pos);
+ return seq_list_start_head(&rxrpc_connection_proc_list, *_pos);
}
static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
loff_t *pos)
{
- return seq_list_next(v, &rxrpc_connections, pos);
+ return seq_list_next(v, &rxrpc_connection_proc_list, pos);
}
static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
@@ -132,29 +142,31 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
{
struct rxrpc_connection *conn;
- char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+ char lbuff[50], rbuff[50];
- if (v == &rxrpc_connections) {
+ if (v == &rxrpc_connection_proc_list) {
seq_puts(seq,
- "Proto Local Remote "
+ "Proto Local "
+ " Remote "
" SvID ConnID End Use State Key "
" Serial ISerial\n"
);
return 0;
}
- conn = list_entry(v, struct rxrpc_connection, link);
-
- sprintf(lbuff, "%pI4:%u",
- &conn->params.local->srx.transport.sin.sin_addr,
- ntohs(conn->params.local->srx.transport.sin.sin_port));
+ conn = list_entry(v, struct rxrpc_connection, proc_link);
+ if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) {
+ strcpy(lbuff, "no_local");
+ strcpy(rbuff, "no_connection");
+ goto print;
+ }
- sprintf(rbuff, "%pI4:%u",
- &conn->params.peer->srx.transport.sin.sin_addr,
- ntohs(conn->params.peer->srx.transport.sin.sin_port));
+ sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport);
+ sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport);
+print:
seq_printf(seq,
- "UDP %-22.22s %-22.22s %4x %08x %s %3u"
+ "UDP %-47.47s %-47.47s %4x %08x %s %3u"
" %s %08x %08x %08x\n",
lbuff,
rbuff,
@@ -165,7 +177,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
rxrpc_conn_states[conn->state],
key_serial(conn->params.key),
atomic_read(&conn->serial),
- atomic_read(&conn->hi_serial));
+ conn->hi_serial);
return 0;
}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 9ed66d533002..c29362d50a92 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -19,399 +19,649 @@
#include "ar-internal.h"
/*
- * removal a call's user ID from the socket tree to make the user ID available
- * again and so that it won't be seen again in association with that call
+ * Post a call for attention by the socket or kernel service. Further
+ * notifications are suppressed by putting recvmsg_link on a dummy queue.
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
+void rxrpc_notify_socket(struct rxrpc_call *call)
{
- _debug("RELEASE CALL %d", call->debug_id);
+ struct rxrpc_sock *rx;
+ struct sock *sk;
+
+ _enter("%d", call->debug_id);
+
+ if (!list_empty(&call->recvmsg_link))
+ return;
+
+ rcu_read_lock();
+
+ rx = rcu_dereference(call->socket);
+ sk = &rx->sk;
+ if (rx && sk->sk_state < RXRPC_CLOSE) {
+ if (call->notify_rx) {
+ call->notify_rx(sk, call, call->user_call_ID);
+ } else {
+ write_lock_bh(&rx->recvmsg_lock);
+ if (list_empty(&call->recvmsg_link)) {
+ rxrpc_get_call(call, rxrpc_call_got);
+ list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
+ }
+ write_unlock_bh(&rx->recvmsg_lock);
- if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- write_lock_bh(&rx->call_lock);
- rb_erase(&call->sock_node, &call->socket->calls);
- clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
- write_unlock_bh(&rx->call_lock);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ _debug("call %ps", sk->sk_data_ready);
+ sk->sk_data_ready(sk);
+ }
+ }
}
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
+ rcu_read_unlock();
+ _leave("");
}
/*
- * receive a message from an RxRPC socket
- * - we need to be careful about two or more threads calling recvmsg
- * simultaneously
+ * Pass a call terminating message to userspace.
*/
-int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
{
- struct rxrpc_skb_priv *sp;
- struct rxrpc_call *call = NULL, *continue_call = NULL;
- struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- struct sk_buff *skb;
- long timeo;
- int copy, ret, ullen, offset, copied = 0;
- u32 abort_code;
+ u32 tmp = 0;
+ int ret;
+
+ switch (call->completion) {
+ case RXRPC_CALL_SUCCEEDED:
+ ret = 0;
+ if (rxrpc_is_service_call(call))
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp);
+ break;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ tmp = call->abort_code;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
+ break;
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ tmp = call->abort_code;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
+ break;
+ case RXRPC_CALL_NETWORK_ERROR:
+ tmp = call->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp);
+ break;
+ case RXRPC_CALL_LOCAL_ERROR:
+ tmp = call->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp);
+ break;
+ default:
+ pr_err("Invalid terminal call state %u\n", call->state);
+ BUG();
+ break;
+ }
- DEFINE_WAIT(wait);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack,
+ call->rx_pkt_offset, call->rx_pkt_len, ret);
+ return ret;
+}
- _enter(",,,%zu,%d", len, flags);
+/*
+ * Pass back notification of a new call. The call is added to the
+ * to-be-accepted list. This means that the next call to be accepted might not
+ * be the last call seen awaiting acceptance, but unless we leave this on the
+ * front of the queue and block all other messages until someone gives us a
+ * user_ID for it, there's not a lot we can do.
+ */
+static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, int flags)
+{
+ int tmp = 0, ret;
- if (flags & (MSG_OOB | MSG_TRUNC))
- return -EOPNOTSUPP;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp);
- ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long);
+ if (ret == 0 && !(flags & MSG_PEEK)) {
+ _debug("to be accepted");
+ write_lock_bh(&rx->recvmsg_lock);
+ list_del_init(&call->recvmsg_link);
+ write_unlock_bh(&rx->recvmsg_lock);
- timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
- msg->msg_flags |= MSG_MORE;
+ rxrpc_get_call(call, rxrpc_call_got);
+ write_lock(&rx->call_lock);
+ list_add_tail(&call->accept_link, &rx->to_be_accepted);
+ write_unlock(&rx->call_lock);
+ }
- lock_sock(&rx->sk);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret);
+ return ret;
+}
- for (;;) {
- /* return immediately if a client socket has no outstanding
- * calls */
- if (RB_EMPTY_ROOT(&rx->calls)) {
- if (copied)
- goto out;
- if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
- release_sock(&rx->sk);
- if (continue_call)
- rxrpc_put_call(continue_call);
- return -ENODATA;
- }
- }
+/*
+ * End the packet reception phase.
+ */
+static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
+{
+ _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
- /* get the next message on the Rx queue */
- skb = skb_peek(&rx->sk.sk_receive_queue);
- if (!skb) {
- /* nothing remains on the queue */
- if (copied &&
- (flags & MSG_PEEK || timeo == 0))
- goto out;
+ trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
+ ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
- /* wait for a message to turn up */
- release_sock(&rx->sk);
- prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
- TASK_INTERRUPTIBLE);
- ret = sock_error(&rx->sk);
- if (ret)
- goto wait_error;
-
- if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
- if (signal_pending(current))
- goto wait_interrupted;
- timeo = schedule_timeout(timeo);
- }
- finish_wait(sk_sleep(&rx->sk), &wait);
- lock_sock(&rx->sk);
- continue;
- }
+ if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
+ rxrpc_propose_ack_terminal_ack);
+ rxrpc_send_ack_packet(call, false);
+ }
- peek_next_packet:
- sp = rxrpc_skb(skb);
- call = sp->call;
- ASSERT(call != NULL);
+ write_lock_bh(&call->state_lock);
- _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ __rxrpc_call_completed(call);
+ write_unlock_bh(&call->state_lock);
+ break;
- /* make sure we wait for the state to be updated in this call */
- spin_lock_bh(&call->lock);
- spin_unlock_bh(&call->lock);
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ call->tx_phase = true;
+ call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
+ call->ack_at = call->expire_at;
+ write_unlock_bh(&call->state_lock);
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
+ rxrpc_propose_ack_processing_op);
+ break;
+ default:
+ write_unlock_bh(&call->state_lock);
+ break;
+ }
+}
+
+/*
+ * Discard a packet we've used up and advance the Rx window by one.
+ */
+static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top;
+ u8 flags;
+ int ix;
+
+ _enter("%d", call->debug_id);
+
+ hard_ack = call->rx_hard_ack;
+ top = smp_load_acquire(&call->rx_top);
+ ASSERT(before(hard_ack, top));
+
+ hard_ack++;
+ ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_rx_rotated);
+ sp = rxrpc_skb(skb);
+ flags = sp->hdr.flags;
+ serial = sp->hdr.serial;
+ if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO)
+ serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1;
+
+ call->rxtx_buffer[ix] = NULL;
+ call->rxtx_annotations[ix] = 0;
+ /* Barrier against rxrpc_input_data(). */
+ smp_store_release(&call->rx_hard_ack, hard_ack);
+
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+
+ _debug("%u,%u,%02x", hard_ack, top, flags);
+ trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
+ if (flags & RXRPC_LAST_PACKET) {
+ rxrpc_end_rx_phase(call, serial);
+ } else {
+ /* Check to see if there's an ACK that needs sending. */
+ if (after_eq(hard_ack, call->ackr_consumed + 2) ||
+ after_eq(top, call->ackr_seen + 2) ||
+ (hard_ack == top && after(hard_ack, call->ackr_consumed)))
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial,
+ true, false,
+ rxrpc_propose_ack_rotate_rx);
+ if (call->ackr_reason)
+ rxrpc_send_ack_packet(call, false);
+ }
+}
- if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
- _debug("packet from released call");
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- continue;
+/*
+ * Decrypt and verify a (sub)packet. The packet's length may be changed due to
+ * padding, but if this is the case, the packet length will be resident in the
+ * socket buffer. Note that we can't modify the master skb info as the skb may
+ * be the home to multiple subpackets.
+ */
+static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ u8 annotation,
+ unsigned int offset, unsigned int len)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t seq = sp->hdr.seq;
+ u16 cksum = sp->hdr.cksum;
+
+ _enter("");
+
+ /* For all but the head jumbo subpacket, the security checksum is in a
+ * jumbo header immediately prior to the data.
+ */
+ if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) {
+ __be16 tmp;
+ if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0)
+ BUG();
+ cksum = ntohs(tmp);
+ seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1;
+ }
+
+ return call->conn->security->verify_packet(call, skb, offset, len,
+ seq, cksum);
+}
+
+/*
+ * Locate the data within a packet. This is complicated by:
+ *
+ * (1) An skb may contain a jumbo packet - so we have to find the appropriate
+ * subpacket.
+ *
+ * (2) The (sub)packets may be encrypted and, if so, the encrypted portion
+ * contains an extra header which includes the true length of the data,
+ * excluding any encrypted padding.
+ */
+static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ u8 *_annotation,
+ unsigned int *_offset, unsigned int *_len)
+{
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
+ unsigned int len = *_len;
+ int ret;
+ u8 annotation = *_annotation;
+
+ /* Locate the subpacket */
+ len = skb->len - offset;
+ if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) {
+ offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) *
+ RXRPC_JUMBO_SUBPKTLEN);
+ len = (annotation & RXRPC_RX_ANNO_JLAST) ?
+ skb->len - offset : RXRPC_JUMBO_SUBPKTLEN;
+ }
+
+ if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
+ ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
+ if (ret < 0)
+ return ret;
+ *_annotation |= RXRPC_RX_ANNO_VERIFIED;
+ }
+
+ *_offset = offset;
+ *_len = len;
+ call->conn->security->locate_data(call, skb, _offset, _len);
+ return 0;
+}
+
+/*
+ * Deliver messages to a call. This keeps processing packets until the buffer
+ * is filled and we find either more DATA (returns 0) or the end of the DATA
+ * (returns 1). If more packets are required, it returns -EAGAIN.
+ */
+static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
+ struct msghdr *msg, struct iov_iter *iter,
+ size_t len, int flags, size_t *_offset)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ rxrpc_seq_t hard_ack, top, seq;
+ size_t remain;
+ bool last;
+ unsigned int rx_pkt_offset, rx_pkt_len;
+ int ix, copy, ret = -EAGAIN, ret2;
+
+ rx_pkt_offset = call->rx_pkt_offset;
+ rx_pkt_len = call->rx_pkt_len;
+
+ if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
+ seq = call->rx_hard_ack;
+ ret = 1;
+ goto done;
+ }
+
+ /* Barriers against rxrpc_input_data(). */
+ hard_ack = call->rx_hard_ack;
+ top = smp_load_acquire(&call->rx_top);
+ for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ if (!skb) {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
+ break;
}
+ smp_rmb();
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ sp = rxrpc_skb(skb);
- /* determine whether to continue last data receive */
- if (continue_call) {
- _debug("maybe cont");
- if (call != continue_call ||
- skb->mark != RXRPC_SKB_MARK_DATA) {
- release_sock(&rx->sk);
- rxrpc_put_call(continue_call);
- _leave(" = %d [noncont]", copied);
- return copied;
+ if (!(flags & MSG_PEEK))
+ trace_rxrpc_receive(call, rxrpc_receive_front,
+ sp->hdr.serial, seq);
+
+ if (msg)
+ sock_recv_timestamp(msg, sock->sk, skb);
+
+ if (rx_pkt_offset == 0) {
+ ret2 = rxrpc_locate_data(call, skb,
+ &call->rxtx_annotations[ix],
+ &rx_pkt_offset, &rx_pkt_len);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
+ rx_pkt_offset, rx_pkt_len, ret2);
+ if (ret2 < 0) {
+ ret = ret2;
+ goto out;
}
+ } else {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
}
- rxrpc_get_call(call);
-
- /* copy the peer address and timestamp */
- if (!continue_call) {
- if (msg->msg_name) {
- size_t len =
- sizeof(call->conn->params.peer->srx);
- memcpy(msg->msg_name,
- &call->conn->params.peer->srx, len);
- msg->msg_namelen = len;
+ /* We have to handle short, empty and used-up DATA packets. */
+ remain = len - *_offset;
+ copy = rx_pkt_len;
+ if (copy > remain)
+ copy = remain;
+ if (copy > 0) {
+ ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
+ copy);
+ if (ret2 < 0) {
+ ret = ret2;
+ goto out;
}
- sock_recv_timestamp(msg, &rx->sk, skb);
- }
- /* receive the message */
- if (skb->mark != RXRPC_SKB_MARK_DATA)
- goto receive_non_data_message;
+ /* handle piecemeal consumption of data packets */
+ rx_pkt_offset += copy;
+ rx_pkt_len -= copy;
+ *_offset += copy;
+ }
- _debug("recvmsg DATA #%u { %d, %d }",
- sp->hdr.seq, skb->len, sp->offset);
+ if (rx_pkt_len > 0) {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
+ ASSERTCMP(*_offset, ==, len);
+ ret = 0;
+ break;
+ }
- if (!continue_call) {
- /* only set the control data once per recvmsg() */
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- ullen, &call->user_call_ID);
- if (ret < 0)
- goto copy_error;
- ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ /* The whole packet has been transferred. */
+ last = sp->hdr.flags & RXRPC_LAST_PACKET;
+ if (!(flags & MSG_PEEK))
+ rxrpc_rotate_rx_window(call);
+ rx_pkt_offset = 0;
+ rx_pkt_len = 0;
+
+ if (last) {
+ ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
+ ret = 1;
+ goto out;
}
+ }
- ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
- ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
- call->rx_data_recv = sp->hdr.seq;
+out:
+ if (!(flags & MSG_PEEK)) {
+ call->rx_pkt_offset = rx_pkt_offset;
+ call->rx_pkt_len = rx_pkt_len;
+ }
+done:
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
+ return ret;
+}
- ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+/*
+ * Receive a message from an RxRPC socket
+ * - we need to be careful about two or more threads calling recvmsg
+ * simultaneously
+ */
+int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct rxrpc_call *call;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct list_head *l;
+ size_t copied = 0;
+ long timeo;
+ int ret;
- offset = sp->offset;
- copy = skb->len - offset;
- if (copy > len - copied)
- copy = len - copied;
+ DEFINE_WAIT(wait);
- ret = skb_copy_datagram_msg(skb, offset, msg, copy);
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0);
- if (ret < 0)
- goto copy_error;
+ if (flags & (MSG_OOB | MSG_TRUNC))
+ return -EOPNOTSUPP;
- /* handle piecemeal consumption of data packets */
- _debug("copied %d+%d", copy, copied);
+ timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
- offset += copy;
- copied += copy;
+try_again:
+ lock_sock(&rx->sk);
- if (!(flags & MSG_PEEK))
- sp->offset = offset;
+ /* Return immediately if a client socket has no outstanding calls */
+ if (RB_EMPTY_ROOT(&rx->calls) &&
+ list_empty(&rx->recvmsg_q) &&
+ rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
+ release_sock(&rx->sk);
+ return -ENODATA;
+ }
- if (sp->offset < skb->len) {
- _debug("buffer full");
- ASSERTCMP(copied, ==, len);
- break;
+ if (list_empty(&rx->recvmsg_q)) {
+ ret = -EWOULDBLOCK;
+ if (timeo == 0) {
+ call = NULL;
+ goto error_no_call;
}
- /* we transferred the whole data packet */
- if (!(flags & MSG_PEEK))
- rxrpc_kernel_data_consumed(call, skb);
-
- if (sp->hdr.flags & RXRPC_LAST_PACKET) {
- _debug("last");
- if (rxrpc_conn_is_client(call->conn)) {
- /* last byte of reply received */
- ret = copied;
- goto terminal_message;
- }
-
- /* last bit of request received */
- if (!(flags & MSG_PEEK)) {
- _debug("eat packet");
- if (skb_dequeue(&rx->sk.sk_receive_queue) !=
- skb)
- BUG();
- rxrpc_free_skb(skb);
- }
- msg->msg_flags &= ~MSG_MORE;
- break;
+ release_sock(&rx->sk);
+
+ /* Wait for something to happen */
+ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
+ TASK_INTERRUPTIBLE);
+ ret = sock_error(&rx->sk);
+ if (ret)
+ goto wait_error;
+
+ if (list_empty(&rx->recvmsg_q)) {
+ if (signal_pending(current))
+ goto wait_interrupted;
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait,
+ 0, 0, 0, 0);
+ timeo = schedule_timeout(timeo);
}
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ goto try_again;
+ }
- /* move on to the next data message */
- _debug("next");
- if (!continue_call)
- continue_call = sp->call;
- else
- rxrpc_put_call(call);
- call = NULL;
-
- if (flags & MSG_PEEK) {
- _debug("peek next");
- skb = skb->next;
- if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue)
- break;
- goto peek_next_packet;
- }
+ /* Find the next call and dequeue it if we're not just peeking. If we
+ * do dequeue it, that comes with a ref that we will need to release.
+ */
+ write_lock_bh(&rx->recvmsg_lock);
+ l = rx->recvmsg_q.next;
+ call = list_entry(l, struct rxrpc_call, recvmsg_link);
+ if (!(flags & MSG_PEEK))
+ list_del_init(&call->recvmsg_link);
+ else
+ rxrpc_get_call(call, rxrpc_call_got);
+ write_unlock_bh(&rx->recvmsg_lock);
+
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
+
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
+ BUG();
- _debug("eat packet");
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- }
+ if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ if (flags & MSG_CMSG_COMPAT) {
+ unsigned int id32 = call->user_call_ID;
- /* end of non-terminal data packet reception for the moment */
- _debug("end rcv data");
-out:
- release_sock(&rx->sk);
- if (call)
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d [data]", copied);
- return copied;
-
- /* handle non-DATA messages such as aborts, incoming connections and
- * final ACKs */
-receive_non_data_message:
- _debug("non-data");
-
- if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
- _debug("RECV NEW CALL");
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
- if (ret < 0)
- goto copy_error;
- if (!(flags & MSG_PEEK)) {
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned int), &id32);
+ } else {
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned long),
+ &call->user_call_ID);
}
- goto out;
+ if (ret < 0)
+ goto error;
}
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- ullen, &call->user_call_ID);
- if (ret < 0)
- goto copy_error;
- ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ if (msg->msg_name) {
+ size_t len = sizeof(call->conn->params.peer->srx);
+ memcpy(msg->msg_name, &call->conn->params.peer->srx, len);
+ msg->msg_namelen = len;
+ }
- switch (skb->mark) {
- case RXRPC_SKB_MARK_DATA:
- BUG();
- case RXRPC_SKB_MARK_FINAL_ACK:
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
- break;
- case RXRPC_SKB_MARK_BUSY:
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
break;
- case RXRPC_SKB_MARK_REMOTE_ABORT:
- abort_code = call->remote_abort;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
- break;
- case RXRPC_SKB_MARK_LOCAL_ABORT:
- abort_code = call->local_abort;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
- break;
- case RXRPC_SKB_MARK_NET_ERROR:
- _debug("RECV NET ERROR %d", sp->error);
- abort_code = sp->error;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
- break;
- case RXRPC_SKB_MARK_LOCAL_ERROR:
- _debug("RECV LOCAL ERROR %d", sp->error);
- abort_code = sp->error;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
- &abort_code);
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len,
+ flags, &copied);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ if (after(call->rx_top, call->rx_hard_ack) &&
+ call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK])
+ rxrpc_notify_socket(call);
break;
default:
- pr_err("Unknown packet mark %u\n", skb->mark);
- BUG();
+ ret = 0;
break;
}
if (ret < 0)
- goto copy_error;
-
-terminal_message:
- _debug("terminal");
- msg->msg_flags &= ~MSG_MORE;
- msg->msg_flags |= MSG_EOR;
+ goto error;
- if (!(flags & MSG_PEEK)) {
- _net("free terminal skb %p", skb);
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- rxrpc_remove_user_ID(rx, call);
+ if (call->state == RXRPC_CALL_COMPLETE) {
+ ret = rxrpc_recvmsg_term(call, msg);
+ if (ret < 0)
+ goto error;
+ if (!(flags & MSG_PEEK))
+ rxrpc_release_call(rx, call);
+ msg->msg_flags |= MSG_EOR;
+ ret = 1;
}
- release_sock(&rx->sk);
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d", ret);
- return ret;
+ if (ret == 0)
+ msg->msg_flags |= MSG_MORE;
+ else
+ msg->msg_flags &= ~MSG_MORE;
+ ret = copied;
-copy_error:
- _debug("copy error");
+error:
+ rxrpc_put_call(call, rxrpc_call_put);
+error_no_call:
release_sock(&rx->sk);
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d", ret);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
return ret;
wait_interrupted:
ret = sock_intr_errno(timeo);
wait_error:
finish_wait(sk_sleep(&rx->sk), &wait);
- if (continue_call)
- rxrpc_put_call(continue_call);
- if (copied)
- copied = ret;
- _leave(" = %d [waitfail %d]", copied, ret);
- return copied;
-
+ call = NULL;
+ goto error_no_call;
}
/**
- * rxrpc_kernel_is_data_last - Determine if data message is last one
- * @skb: Message holding data
+ * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
+ * @sock: The socket that the call exists on
+ * @call: The call to send data through
+ * @buf: The buffer to receive into
+ * @size: The size of the buffer, including data already read
+ * @_offset: The running offset into the buffer.
+ * @want_more: True if more data is expected to be read
+ * @_abort: Where the abort code is stored if -ECONNABORTED is returned
*
- * Determine if data message is last one for the parent call.
+ * Allow a kernel service to receive data and pick up information about the
+ * state of a call. Returns 0 if got what was asked for and there's more
+ * available, 1 if we got what was asked for and we're at the end of the data
+ * and -EAGAIN if we need more data.
+ *
+ * Note that we may return -EAGAIN to drain empty packets at the end of the
+ * data, even if we've already copied over the requested data.
+ *
+ * This function adds the amount it transfers to *_offset, so this should be
+ * precleared as appropriate. Note that the amount remaining in the buffer is
+ * taken to be size - *_offset.
+ *
+ * *_abort should also be initialised to 0.
*/
-bool rxrpc_kernel_is_data_last(struct sk_buff *skb)
+int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
+ void *buf, size_t size, size_t *_offset,
+ bool want_more, u32 *_abort)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct iov_iter iter;
+ struct kvec iov;
+ int ret;
- ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA);
+ _enter("{%d,%s},%zu/%zu,%d",
+ call->debug_id, rxrpc_call_states[call->state],
+ *_offset, size, want_more);
- return sp->hdr.flags & RXRPC_LAST_PACKET;
-}
+ ASSERTCMP(*_offset, <=, size);
+ ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
-EXPORT_SYMBOL(rxrpc_kernel_is_data_last);
+ iov.iov_base = buf + *_offset;
+ iov.iov_len = size - *_offset;
+ iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
-/**
- * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message
- * @skb: Message indicating an abort
- *
- * Get the abort code from an RxRPC abort message.
- */
-u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ lock_sock(sock->sk);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0,
+ _offset);
+ if (ret < 0)
+ goto out;
+
+ /* We can only reach here with a partially full buffer if we
+ * have reached the end of the data. We must otherwise have a
+ * full buffer or have been given -EAGAIN.
+ */
+ if (ret == 1) {
+ if (*_offset < size)
+ goto short_data;
+ if (!want_more)
+ goto read_phase_complete;
+ ret = 0;
+ goto out;
+ }
+
+ if (!want_more)
+ goto excess_data;
+ goto out;
+
+ case RXRPC_CALL_COMPLETE:
+ goto call_complete;
- switch (skb->mark) {
- case RXRPC_SKB_MARK_REMOTE_ABORT:
- return sp->call->remote_abort;
- case RXRPC_SKB_MARK_LOCAL_ABORT:
- return sp->call->local_abort;
default:
- BUG();
+ ret = -EINPROGRESS;
+ goto out;
}
-}
-
-EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
-/**
- * rxrpc_kernel_get_error - Get the error number from an RxRPC error message
- * @skb: Message indicating an error
- *
- * Get the error number from an RxRPC error message.
- */
-int rxrpc_kernel_get_error_number(struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+read_phase_complete:
+ ret = 1;
+out:
+ release_sock(sock->sk);
+ _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
+ return ret;
- return sp->error;
+short_data:
+ ret = -EBADMSG;
+ goto out;
+excess_data:
+ ret = -EMSGSIZE;
+ goto out;
+call_complete:
+ *_abort = call->abort_code;
+ ret = -call->error;
+ if (call->completion == RXRPC_CALL_SUCCEEDED) {
+ ret = 1;
+ if (size > 0)
+ ret = -ECONNRESET;
+ }
+ goto out;
}
-
-EXPORT_SYMBOL(rxrpc_kernel_get_error_number);
+EXPORT_SYMBOL(rxrpc_kernel_recv_data);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 63afa9e9cc08..4374e7b9c7bf 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
case RXRPC_SECURITY_AUTH:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level1_hdr);
- conn->header_size += sizeof(struct rxkad_level1_hdr);
break;
case RXRPC_SECURITY_ENCRYPT:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level2_hdr);
- conn->header_size += sizeof(struct rxkad_level2_hdr);
break;
default:
ret = -EKEYREJECTED;
@@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
data_size |= (u32)check << 16;
hdr.data_size = htonl(data_size);
@@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
@@ -275,9 +273,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
/* calculate the security checksum */
- x = call->channel << (32 - RXRPC_CIDSHIFT);
+ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(sp->hdr.callNumber);
+ call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
@@ -316,12 +314,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
/*
* decrypt partial encryption on a packet (level 1 security)
*/
-static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq)
{
struct rxkad_level1_hdr sechdr;
- struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
@@ -332,15 +329,20 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
_enter("");
- sp = rxrpc_skb(skb);
+ if (len < 8) {
+ rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO);
+ goto protocol_error;
+ }
- /* we want to decrypt the skbuff in-place */
+ /* Decrypt the skbuff in-place. TODO: We really want to decrypt
+ * directly into the target buffer.
+ */
nsg = skb_cow_data(skb, 0, &trailer);
if (nsg < 0 || nsg > 16)
goto nomem;
sg_init_table(sg, nsg);
- skb_to_sgvec(skb, sg, 0, 8);
+ skb_to_sgvec(skb, sg, offset, 8);
/* start the decryption afresh */
memset(&iv, 0, sizeof(iv));
@@ -351,35 +353,35 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
- /* remove the decrypted packet length */
- if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
- goto datalen_error;
- if (!skb_pull(skb, sizeof(sechdr)))
- BUG();
+ /* Extract the decrypted packet length */
+ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
+ offset += sizeof(sechdr);
+ len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= sp->hdr.seq ^ sp->hdr.callNumber;
+ check ^= seq ^ call->call_id;
check &= 0xffff;
if (check != 0) {
- *_abort_code = RXKADSEALEDINCON;
+ rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO);
goto protocol_error;
}
- /* shorten the packet to remove the padding */
- if (data_size > skb->len)
- goto datalen_error;
- else if (data_size < skb->len)
- skb->len = data_size;
+ if (data_size > len) {
+ rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
_leave(" = 0 [dlen=%x]", data_size);
return 0;
-datalen_error:
- *_abort_code = RXKADDATALEN;
protocol_error:
+ rxrpc_send_abort_packet(call);
_leave(" = -EPROTO");
return -EPROTO;
@@ -391,13 +393,12 @@ nomem:
/*
* wholly decrypt a packet (level 2 security)
*/
-static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
- struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
@@ -408,9 +409,14 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
_enter(",{%d}", skb->len);
- sp = rxrpc_skb(skb);
+ if (len < 8) {
+ rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO);
+ goto protocol_error;
+ }
- /* we want to decrypt the skbuff in-place */
+ /* Decrypt the skbuff in-place. TODO: We really want to decrypt
+ * directly into the target buffer.
+ */
nsg = skb_cow_data(skb, 0, &trailer);
if (nsg < 0)
goto nomem;
@@ -423,7 +429,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
}
sg_init_table(sg, nsg);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ skb_to_sgvec(skb, sg, offset, len);
/* decrypt from the session key */
token = call->conn->params.key->payload.data[0];
@@ -431,41 +437,41 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x);
+ skcipher_request_set_crypt(req, sg, sg, len, iv.x);
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
if (sg != _sg)
kfree(sg);
- /* remove the decrypted packet length */
- if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
- goto datalen_error;
- if (!skb_pull(skb, sizeof(sechdr)))
- BUG();
+ /* Extract the decrypted packet length */
+ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
+ offset += sizeof(sechdr);
+ len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= sp->hdr.seq ^ sp->hdr.callNumber;
+ check ^= seq ^ call->call_id;
check &= 0xffff;
if (check != 0) {
- *_abort_code = RXKADSEALEDINCON;
+ rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO);
goto protocol_error;
}
- /* shorten the packet to remove the padding */
- if (data_size > skb->len)
- goto datalen_error;
- else if (data_size < skb->len)
- skb->len = data_size;
+ if (data_size > len) {
+ rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
_leave(" = 0 [dlen=%x]", data_size);
return 0;
-datalen_error:
- *_abort_code = RXKADDATALEN;
protocol_error:
+ rxrpc_send_abort_packet(call);
_leave(" = -EPROTO");
return -EPROTO;
@@ -475,40 +481,31 @@ nomem:
}
/*
- * verify the security on a received packet
+ * Verify the security on a received packet or subpacket (if part of a
+ * jumbo packet).
*/
-static int rxkad_verify_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq, u16 expected_cksum)
{
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
- struct rxrpc_skb_priv *sp;
struct rxrpc_crypt iv;
struct scatterlist sg;
u16 cksum;
u32 x, y;
- int ret;
-
- sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u}",
- call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq);
+ call->debug_id, key_serial(call->conn->params.key), seq);
if (!call->conn->cipher)
return 0;
- if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) {
- *_abort_code = RXKADINCONSISTENCY;
- _leave(" = -EPROTO [not rxkad]");
- return -EPROTO;
- }
-
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
/* validate the security checksum */
- x = call->channel << (32 - RXRPC_CIDSHIFT);
- x |= sp->hdr.seq & 0x3fffffff;
+ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
+ x |= seq & 0x3fffffff;
call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
@@ -524,29 +521,69 @@ static int rxkad_verify_packet(struct rxrpc_call *call,
if (cksum == 0)
cksum = 1; /* zero checksums are not permitted */
- if (sp->hdr.cksum != cksum) {
- *_abort_code = RXKADSEALEDINCON;
+ if (cksum != expected_cksum) {
+ rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO);
+ rxrpc_send_abort_packet(call);
_leave(" = -EPROTO [csum failed]");
return -EPROTO;
}
switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
- ret = 0;
- break;
+ return 0;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_verify_packet_auth(call, skb, _abort_code);
- break;
+ return rxkad_verify_packet_1(call, skb, offset, len, seq);
case RXRPC_SECURITY_ENCRYPT:
- ret = rxkad_verify_packet_encrypt(call, skb, _abort_code);
- break;
+ return rxkad_verify_packet_2(call, skb, offset, len, seq);
default:
- ret = -ENOANO;
- break;
+ return -ENOANO;
}
+}
- _leave(" = %d", ret);
- return ret;
+/*
+ * Locate the data contained in a packet that was partially encrypted.
+ */
+static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ struct rxkad_level1_hdr sechdr;
+
+ if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
+ BUG();
+ *_offset += sizeof(sechdr);
+ *_len = ntohl(sechdr.data_size) & 0xffff;
+}
+
+/*
+ * Locate the data contained in a packet that was completely encrypted.
+ */
+static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ struct rxkad_level2_hdr sechdr;
+
+ if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
+ BUG();
+ *_offset += sizeof(sechdr);
+ *_len = ntohl(sechdr.data_size) & 0xffff;
+}
+
+/*
+ * Locate the data contained in an already decrypted packet.
+ */
+static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ switch (call->conn->params.security_level) {
+ case RXRPC_SECURITY_AUTH:
+ rxkad_locate_data_1(call, skb, _offset, _len);
+ return;
+ case RXRPC_SECURITY_ENCRYPT:
+ rxkad_locate_data_2(call, skb, _offset, _len);
+ return;
+ default:
+ return;
+ }
}
/*
@@ -716,7 +753,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
struct rxkad_challenge challenge;
struct rxkad_response resp
__attribute__((aligned(8))); /* must be aligned for crypto */
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
u32 version, nonce, min_level, abort_code;
int ret;
@@ -734,8 +771,8 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
}
abort_code = RXKADPACKETSHORT;
- sp = rxrpc_skb(skb);
- if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &challenge, sizeof(challenge)) < 0)
goto protocol_error;
version = ntohl(challenge.version);
@@ -981,7 +1018,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
{
struct rxkad_response response
__attribute__((aligned(8))); /* must be aligned for crypto */
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt session_key;
time_t expiry;
void *ticket;
@@ -992,7 +1029,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &response, sizeof(response)) < 0)
goto protocol_error;
if (!pskb_pull(skb, sizeof(response)))
BUG();
@@ -1000,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
version = ntohl(response.version);
ticket_len = ntohl(response.ticket_len);
kvno = ntohl(response.kvno);
- sp = rxrpc_skb(skb);
_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
sp->hdr.serial, version, kvno, ticket_len);
@@ -1022,7 +1059,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
return -ENOMEM;
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ ticket, ticket_len) < 0)
goto protocol_error_free;
ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
@@ -1147,6 +1185,7 @@ const struct rxrpc_security rxkad = {
.prime_packet_security = rxkad_prime_packet_security,
.secure_packet = rxkad_secure_packet,
.verify_packet = rxkad_verify_packet,
+ .locate_data = rxkad_locate_data,
.issue_challenge = rxkad_issue_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
.verify_response = rxkad_verify_response,
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 814d285ff802..7d921e56e715 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
}
/* find the service */
- read_lock_bh(&local->services_lock);
- list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == conn->params.service_id)
- goto found_service;
- }
+ read_lock(&local->services_lock);
+ rx = rcu_dereference_protected(local->service,
+ lockdep_is_held(&local->services_lock));
+ if (rx && rx->srx.srx_service == conn->params.service_id)
+ goto found_service;
/* the service appears to have died */
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = -ENOENT");
return -ENOENT;
found_service:
if (!rx->securities) {
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = -ENOKEY");
return -ENOKEY;
}
@@ -152,13 +152,13 @@ found_service:
kref = keyring_search(make_key_ref(rx->securities, 1UL),
&key_type_rxrpc_s, kdesc);
if (IS_ERR(kref)) {
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = %ld [search]", PTR_ERR(kref));
return PTR_ERR(kref);
}
key = key_ref_to_ptr(kref);
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
conn->server_key = key;
conn->security = sec;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
new file mode 100644
index 000000000000..b214a4d4a641
--- /dev/null
+++ b/net/rxrpc/sendmsg.c
@@ -0,0 +1,610 @@
+/* AF_RXRPC sendmsg() implementation.
+ *
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+enum rxrpc_command {
+ RXRPC_CMD_SEND_DATA, /* send data message */
+ RXRPC_CMD_SEND_ABORT, /* request abort generation */
+ RXRPC_CMD_ACCEPT, /* [server] accept incoming call */
+ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */
+};
+
+/*
+ * wait for space to appear in the transmit/ACK window
+ * - caller holds the socket locked
+ */
+static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ long *timeo)
+{
+ DECLARE_WAITQUEUE(myself, current);
+ int ret;
+
+ _enter(",{%u,%u,%u}",
+ call->tx_hard_ack, call->tx_top, call->tx_winsize);
+
+ add_wait_queue(&call->waitq, &myself);
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = 0;
+ if (call->tx_top - call->tx_hard_ack <
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra))
+ break;
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -call->error;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(*timeo);
+ break;
+ }
+
+ trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ release_sock(&rx->sk);
+ *timeo = schedule_timeout(*timeo);
+ lock_sock(&rx->sk);
+ }
+
+ remove_wait_queue(&call->waitq, &myself);
+ set_current_state(TASK_RUNNING);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Schedule an instant Tx resend.
+ */
+static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
+{
+ spin_lock_bh(&call->lock);
+
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ rxrpc_queue_call(call);
+ }
+
+ spin_unlock_bh(&call->lock);
+}
+
+/*
+ * Queue a DATA packet for transmission, set the resend timeout and send the
+ * packet immediately
+ */
+static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ bool last)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t seq = sp->hdr.seq;
+ int ret, ix;
+ u8 annotation = RXRPC_TX_ANNO_UNACK;
+
+ _net("queue skb %p [%d]", skb, seq);
+
+ ASSERTCMP(seq, ==, call->tx_top + 1);
+
+ if (last)
+ annotation |= RXRPC_TX_ANNO_LAST;
+
+ /* We have to set the timestamp before queueing as the retransmit
+ * algorithm can see the packet as soon as we queue it.
+ */
+ skb->tstamp = ktime_get_real();
+
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ call->rxtx_annotations[ix] = annotation;
+ smp_wmb();
+ call->rxtx_buffer[ix] = skb;
+ call->tx_top = seq;
+ if (last)
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
+ else
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue);
+
+ if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
+ _debug("________awaiting reply/ACK__________");
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ break;
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ call->state = RXRPC_CALL_SERVER_SEND_REPLY;
+ call->ack_at = call->expire_at;
+ if (call->ackr_reason == RXRPC_ACK_DELAY)
+ call->ackr_reason = 0;
+ __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
+ ktime_get_real());
+ if (!last)
+ break;
+ case RXRPC_CALL_SERVER_SEND_REPLY:
+ call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
+ break;
+ default:
+ break;
+ }
+ write_unlock_bh(&call->state_lock);
+ }
+
+ if (seq == 1 && rxrpc_is_client_call(call))
+ rxrpc_expose_client_call(call);
+
+ ret = rxrpc_send_data_packet(call, skb, false);
+ if (ret < 0) {
+ _debug("need instant resend %d", ret);
+ rxrpc_instant_resend(call, ix);
+ } else {
+ ktime_t now = ktime_get_real(), resend_at;
+
+ resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
+
+ if (ktime_before(resend_at, call->resend_at)) {
+ call->resend_at = resend_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
+ }
+ }
+
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ _leave("");
+}
+
+/*
+ * send data through a socket
+ * - must be called in process context
+ * - caller holds the socket locked
+ */
+static int rxrpc_send_data(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, size_t len)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ struct sock *sk = &rx->sk;
+ long timeo;
+ bool more;
+ int ret, copied;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ /* this should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ return -EPIPE;
+
+ more = msg->msg_flags & MSG_MORE;
+
+ skb = call->tx_pending;
+ call->tx_pending = NULL;
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
+
+ copied = 0;
+ do {
+ /* Check to see if there's a ping ACK to reply to. */
+ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_send_ack_packet(call, false);
+
+ if (!skb) {
+ size_t size, chunk, max, space;
+
+ _debug("alloc");
+
+ if (call->tx_top - call->tx_hard_ack >=
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra)) {
+ ret = -EAGAIN;
+ if (msg->msg_flags & MSG_DONTWAIT)
+ goto maybe_error;
+ ret = rxrpc_wait_for_tx_window(rx, call,
+ &timeo);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
+ max = RXRPC_JUMBO_DATALEN;
+ max -= call->conn->security_size;
+ max &= ~(call->conn->size_align - 1UL);
+
+ chunk = max;
+ if (chunk > msg_data_left(msg) && !more)
+ chunk = msg_data_left(msg);
+
+ space = chunk + call->conn->size_align;
+ space &= ~(call->conn->size_align - 1UL);
+
+ size = space + call->conn->security_size;
+
+ _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
+
+ /* create a buffer that we can retain until it's ACK'd */
+ skb = sock_alloc_send_skb(
+ sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto maybe_error;
+
+ rxrpc_new_skb(skb, rxrpc_skb_tx_new);
+
+ _debug("ALLOC SEND %p", skb);
+
+ ASSERTCMP(skb->mark, ==, 0);
+
+ _debug("HS: %u", call->conn->security_size);
+ skb_reserve(skb, call->conn->security_size);
+ skb->len += call->conn->security_size;
+
+ sp = rxrpc_skb(skb);
+ sp->remain = chunk;
+ if (sp->remain > skb_tailroom(skb))
+ sp->remain = skb_tailroom(skb);
+
+ _net("skb: hr %d, tr %d, hl %d, rm %d",
+ skb_headroom(skb),
+ skb_tailroom(skb),
+ skb_headlen(skb),
+ sp->remain);
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ _debug("append");
+ sp = rxrpc_skb(skb);
+
+ /* append next segment of data to the current buffer */
+ if (msg_data_left(msg) > 0) {
+ int copy = skb_tailroom(skb);
+ ASSERTCMP(copy, >, 0);
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+ if (copy > sp->remain)
+ copy = sp->remain;
+
+ _debug("add");
+ ret = skb_add_data(skb, &msg->msg_iter, copy);
+ _debug("added");
+ if (ret < 0)
+ goto efault;
+ sp->remain -= copy;
+ skb->mark += copy;
+ copied += copy;
+ }
+
+ /* check for the far side aborting the call or a network error
+ * occurring */
+ if (call->state == RXRPC_CALL_COMPLETE)
+ goto call_terminated;
+
+ /* add the packet to the send queue if it's now full */
+ if (sp->remain <= 0 ||
+ (msg_data_left(msg) == 0 && !more)) {
+ struct rxrpc_connection *conn = call->conn;
+ uint32_t seq;
+ size_t pad;
+
+ /* pad out if we're using security */
+ if (conn->security_ix) {
+ pad = conn->security_size + skb->mark;
+ pad = conn->size_align - pad;
+ pad &= conn->size_align - 1;
+ _debug("pad %zu", pad);
+ if (pad)
+ memset(skb_put(skb, pad), 0, pad);
+ }
+
+ seq = call->tx_top + 1;
+
+ sp->hdr.seq = seq;
+ sp->hdr._rsvd = 0;
+ sp->hdr.flags = conn->out_clientflag;
+
+ if (msg_data_left(msg) == 0 && !more)
+ sp->hdr.flags |= RXRPC_LAST_PACKET;
+ else if (call->tx_top - call->tx_hard_ack <
+ call->tx_winsize)
+ sp->hdr.flags |= RXRPC_MORE_PACKETS;
+
+ ret = conn->security->secure_packet(
+ call, skb, skb->mark, skb->head);
+ if (ret < 0)
+ goto out;
+
+ rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
+ skb = NULL;
+ }
+ } while (msg_data_left(msg) > 0);
+
+success:
+ ret = copied;
+out:
+ call->tx_pending = skb;
+ _leave(" = %d", ret);
+ return ret;
+
+call_terminated:
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ _leave(" = %d", -call->error);
+ return -call->error;
+
+maybe_error:
+ if (copied)
+ goto success;
+ goto out;
+
+efault:
+ ret = -EFAULT;
+ goto out;
+}
+
+/*
+ * extract control messages from the sendmsg() control buffer
+ */
+static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
+ unsigned long *user_call_ID,
+ enum rxrpc_command *command,
+ u32 *abort_code,
+ bool *_exclusive)
+{
+ struct cmsghdr *cmsg;
+ bool got_user_ID = false;
+ int len;
+
+ *command = RXRPC_CMD_SEND_DATA;
+
+ if (msg->msg_controllen == 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ _debug("CMSG %d, %d, %d",
+ cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+ if (cmsg->cmsg_level != SOL_RXRPC)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RXRPC_USER_CALL_ID:
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
+ if (len != sizeof(u32))
+ return -EINVAL;
+ *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
+ } else {
+ if (len != sizeof(unsigned long))
+ return -EINVAL;
+ *user_call_ID = *(unsigned long *)
+ CMSG_DATA(cmsg);
+ }
+ _debug("User Call ID %lx", *user_call_ID);
+ got_user_ID = true;
+ break;
+
+ case RXRPC_ABORT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_SEND_ABORT;
+ if (len != sizeof(*abort_code))
+ return -EINVAL;
+ *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
+ _debug("Abort %x", *abort_code);
+ if (*abort_code == 0)
+ return -EINVAL;
+ break;
+
+ case RXRPC_ACCEPT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_ACCEPT;
+ if (len != 0)
+ return -EINVAL;
+ break;
+
+ case RXRPC_EXCLUSIVE_CALL:
+ *_exclusive = true;
+ if (len != 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (!got_user_ID)
+ return -EINVAL;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Create a new client call for sendmsg().
+ */
+static struct rxrpc_call *
+rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ unsigned long user_call_ID, bool exclusive)
+{
+ struct rxrpc_conn_parameters cp;
+ struct rxrpc_call *call;
+ struct key *key;
+
+ DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
+
+ _enter("");
+
+ if (!msg->msg_name)
+ return ERR_PTR(-EDESTADDRREQ);
+
+ key = rx->key;
+ if (key && !rx->key->payload.data[0])
+ key = NULL;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
+ cp.key = rx->key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = rx->exclusive | exclusive;
+ cp.service_id = srx->srx_service;
+ call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
+
+ _leave(" = %p\n", call);
+ return call;
+}
+
+/*
+ * send a message forming part of a client call through an RxRPC socket
+ * - caller holds the socket locked
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+{
+ enum rxrpc_command cmd;
+ struct rxrpc_call *call;
+ unsigned long user_call_ID = 0;
+ bool exclusive = false;
+ u32 abort_code = 0;
+ int ret;
+
+ _enter("");
+
+ ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
+ &exclusive);
+ if (ret < 0)
+ return ret;
+
+ if (cmd == RXRPC_CMD_ACCEPT) {
+ if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
+ return -EINVAL;
+ call = rxrpc_accept_call(rx, user_call_ID, NULL);
+ if (IS_ERR(call))
+ return PTR_ERR(call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ return 0;
+ }
+
+ call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
+ if (!call) {
+ if (cmd != RXRPC_CMD_SEND_DATA)
+ return -EBADSLT;
+ call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
+ exclusive);
+ if (IS_ERR(call))
+ return PTR_ERR(call);
+ }
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ /* it's too late for this call */
+ ret = -ESHUTDOWN;
+ } else if (cmd == RXRPC_CMD_SEND_ABORT) {
+ ret = 0;
+ if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED))
+ ret = rxrpc_send_abort_packet(call);
+ } else if (cmd != RXRPC_CMD_SEND_DATA) {
+ ret = -EINVAL;
+ } else if (rxrpc_is_client_call(call) &&
+ call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
+ /* request phase complete for this client call */
+ ret = -EPROTO;
+ } else if (rxrpc_is_service_call(call) &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ /* Reply phase not begun or not complete for service call. */
+ ret = -EPROTO;
+ } else {
+ ret = rxrpc_send_data(rx, call, msg, len);
+ }
+
+ rxrpc_put_call(call, rxrpc_call_put);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
+ * @sock: The socket the call is on
+ * @call: The call to send data through
+ * @msg: The data to send
+ * @len: The amount of data to send
+ *
+ * Allow a kernel service to send data on a call. The call must be in an state
+ * appropriate to sending data. No control data should be supplied in @msg,
+ * nor should an address be supplied. MSG_MORE should be flagged if there's
+ * more data to come, otherwise this data will end the transmission phase.
+ */
+int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
+ struct msghdr *msg, size_t len)
+{
+ int ret;
+
+ _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
+
+ ASSERTCMP(msg->msg_name, ==, NULL);
+ ASSERTCMP(msg->msg_control, ==, NULL);
+
+ lock_sock(sock->sk);
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -ESHUTDOWN; /* it's too late for this call */
+ } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ ret = -EPROTO; /* request phase complete for this client call */
+ } else {
+ ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len);
+ }
+
+ release_sock(sock->sk);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(rxrpc_kernel_send_data);
+
+/**
+ * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
+ * @sock: The socket the call is on
+ * @call: The call to be aborted
+ * @abort_code: The abort code to stick into the ABORT packet
+ * @error: Local error value
+ * @why: 3-char string indicating why.
+ *
+ * Allow a kernel service to abort a call, if it's still in an abortable state.
+ */
+void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
+ u32 abort_code, int error, const char *why)
+{
+ _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
+
+ lock_sock(sock->sk);
+
+ if (rxrpc_abort_call(why, call, 0, abort_code, error))
+ rxrpc_send_abort_packet(call);
+
+ release_sock(sock->sk);
+ _leave("");
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_abort_call);
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 06c51d4b622d..67b02c45271b 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -18,148 +18,82 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
+#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
+
/*
- * set up for the ACK at the end of the receive phase when we discard the final
- * receive phase data packet
- * - called with softirqs disabled
+ * Note the allocation or reception of a socket buffer.
*/
-static void rxrpc_request_final_ACK(struct rxrpc_call *call)
+void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
- /* the call may be aborted before we have a chance to ACK it */
- write_lock(&call->state_lock);
-
- switch (call->state) {
- case RXRPC_CALL_CLIENT_RECV_REPLY:
- call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
- _debug("request final ACK");
-
- /* get an extra ref on the call for the final-ACK generator to
- * release */
- rxrpc_get_call(call);
- set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
- if (try_to_del_timer_sync(&call->ack_timer) >= 0)
- rxrpc_queue_call(call);
- break;
-
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
- default:
- break;
- }
-
- write_unlock(&call->state_lock);
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
/*
- * drop the bottom ACK off of the call ACK window and advance the window
+ * Note the re-emergence of a socket buffer from a queue or buffer.
*/
-static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
- struct rxrpc_skb_priv *sp)
+void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
- int loop;
- u32 seq;
-
- spin_lock_bh(&call->lock);
-
- _debug("hard ACK #%u", sp->hdr.seq);
-
- for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
- call->ackr_window[loop] >>= 1;
- call->ackr_window[loop] |=
- call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
- }
-
- seq = sp->hdr.seq;
- ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
- call->rx_data_eaten = seq;
-
- if (call->ackr_win_top < UINT_MAX)
- call->ackr_win_top++;
-
- ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
- call->rx_data_post, >=, call->rx_data_recv);
- ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
- call->rx_data_recv, >=, call->rx_data_eaten);
-
- if (sp->hdr.flags & RXRPC_LAST_PACKET) {
- rxrpc_request_final_ACK(call);
- } else if (atomic_dec_and_test(&call->ackr_not_idle) &&
- test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
- /* We previously soft-ACK'd some received packets that have now
- * been consumed, so send a hard-ACK if no more packets are
- * immediately forthcoming to allow the transmitter to free up
- * its Tx bufferage.
- */
- _debug("send Rx idle ACK");
- __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial,
- false);
+ const void *here = __builtin_return_address(0);
+ if (skb) {
+ int n = atomic_read(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
-
- spin_unlock_bh(&call->lock);
}
-/**
- * rxrpc_kernel_data_consumed - Record consumption of data message
- * @call: The call to which the message pertains.
- * @skb: Message holding data
- *
- * Record the consumption of a data message and generate an ACK if appropriate.
- * The call state is shifted if this was the final packet. The caller must be
- * in process context with no spinlocks held.
- *
- * TODO: Actually generate the ACK here rather than punting this to the
- * workqueue.
+/*
+ * Note the addition of a ref on a socket buffer.
*/
-void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb)
+void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq);
-
- ASSERTCMP(sp->call, ==, call);
- ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
-
- /* TODO: Fix the sequence number tracking */
- ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
- ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
- ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
-
- call->rx_data_recv = sp->hdr.seq;
- rxrpc_hard_ACK_data(call, sp);
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ skb_get(skb);
}
-EXPORT_SYMBOL(rxrpc_kernel_data_consumed);
/*
- * Destroy a packet that has an RxRPC control buffer
+ * Note the destruction of a socket buffer.
*/
-void rxrpc_packet_destructor(struct sk_buff *skb)
+void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_call *call = sp->call;
-
- _enter("%p{%p}", skb, call);
-
- if (call) {
- if (atomic_dec_return(&call->skb_count) < 0)
- BUG();
- rxrpc_put_call(call);
- sp->call = NULL;
+ const void *here = __builtin_return_address(0);
+ if (skb) {
+ int n;
+ CHECK_SLAB_OKAY(&skb->users);
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ kfree_skb(skb);
}
+}
- if (skb->sk)
- sock_rfree(skb);
- _leave("");
+/*
+ * Note the injected loss of a socket buffer.
+ */
+void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+{
+ const void *here = __builtin_return_address(0);
+ if (skb) {
+ int n;
+ CHECK_SLAB_OKAY(&skb->users);
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ kfree_skb(skb);
+ }
}
-/**
- * rxrpc_kernel_free_skb - Free an RxRPC socket buffer
- * @skb: The socket buffer to be freed
- *
- * Let RxRPC free its own socket buffer, permitting it to maintain debug
- * accounting.
+/*
+ * Clear a queue of socket buffers.
*/
-void rxrpc_kernel_free_skb(struct sk_buff *skb)
+void rxrpc_purge_queue(struct sk_buff_head *list)
{
- rxrpc_free_skb(skb);
+ const void *here = __builtin_return_address(0);
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue((list))) != NULL) {
+ int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged));
+ trace_rxrpc_skb(skb, rxrpc_skb_rx_purged,
+ atomic_read(&skb->users), n, here);
+ kfree_skb(skb);
+ }
}
-EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 03ad08774d4e..34c706d2f79c 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -20,7 +20,7 @@ static const unsigned int one = 1;
static const unsigned int four = 4;
static const unsigned int thirtytwo = 32;
static const unsigned int n_65535 = 65535;
-static const unsigned int n_max_acks = RXRPC_MAXACKS;
+static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
/*
* RxRPC operating parameters.
@@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_requested_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&zero,
},
{
@@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_soft_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_idle_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -59,6 +59,22 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_resend_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "idle_conn_expiry",
+ .data = &rxrpc_conn_idle_client_expiry,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "idle_conn_fast_expiry",
+ .data = &rxrpc_conn_idle_client_fast_expiry,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
.extra1 = (void *)&one,
},
@@ -69,29 +85,28 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_max_call_lifetime,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
+
+ /* Non-time values */
{
- .procname = "dead_call_expiry",
- .data = &rxrpc_dead_call_expiry,
+ .procname = "max_client_conns",
+ .data = &rxrpc_max_client_connections,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- .extra1 = (void *)&one,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&rxrpc_reap_client_connections,
},
-
- /* Values measured in seconds */
{
- .procname = "connection_expiry",
- .data = &rxrpc_connection_expiry,
+ .procname = "reap_client_conns",
+ .data = &rxrpc_reap_client_connections,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
+ .extra2 = (void *)&rxrpc_max_client_connections,
},
-
- /* Non-time values */
{
.procname = "max_backlog",
.data = &rxrpc_max_backlog,
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
index b88914d53ca5..ff7af71c4b49 100644
--- a/net/rxrpc/utils.c
+++ b/net/rxrpc/utils.c
@@ -30,6 +30,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
return 0;
+#ifdef CONFIG_AF_RXRPC_IPV6
case ETH_P_IPV6:
srx->transport_type = SOCK_DGRAM;
srx->transport_len = sizeof(srx->transport.sin6);
@@ -37,6 +38,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
srx->transport.sin6.sin6_port = udp_hdr(skb)->source;
srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
return 0;
+#endif
default:
pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n",
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..87956a768d1b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -749,6 +749,17 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_SKBMOD
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
+
config NET_ACT_IFE
tristate "Inter-FE action based on IETF ForCES InterFE LFB"
depends on NET_CLS_ACT
@@ -761,6 +772,17 @@ config NET_ACT_IFE
To compile this code as a module, choose M here: the
module will be called act_ife.
+config NET_ACT_TUNNEL_KEY
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to set/release ip tunnel metadata.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_tunnel_key.
+
config NET_IFE_SKBMARK
tristate "Support to encoding decoding skb mark on IFE action"
depends on NET_ACT_IFE
@@ -771,6 +793,11 @@ config NET_IFE_SKBPRIO
depends on NET_ACT_IFE
---help---
+config NET_IFE_SKBTCINDEX
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+ ---help---
+
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..4bdda3634e0b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,9 +19,12 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
+obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d0687594b..e10456ef6f7a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfa_bstats,
- &p->tcfa_rate_est);
+ gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
- gen_kill_estimator(&a->tcfa_bstats,
- &a->tcfa_rate_est);
+ gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -341,22 +339,25 @@ int tcf_register_action(struct tc_action_ops *act,
if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
return -EINVAL;
+ /* We have to register pernet ops before making the action ops visible,
+ * otherwise tcf_action_init_1() could get a partially initialized
+ * netns.
+ */
+ ret = register_pernet_subsys(ops);
+ if (ret)
+ return ret;
+
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
write_unlock(&act_mod_lock);
+ unregister_pernet_subsys(ops);
return -EEXIST;
}
}
list_add_tail(&act->head, &act_base);
write_unlock(&act_mod_lock);
- ret = register_pernet_subsys(ops);
- if (ret) {
- tcf_unregister_action(act, ops);
- return ret;
- }
-
return 0;
}
EXPORT_SYMBOL(tcf_register_action);
@@ -367,8 +368,6 @@ int tcf_unregister_action(struct tc_action_ops *act,
struct tc_action_ops *a;
int err = -ENOENT;
- unregister_pernet_subsys(ops);
-
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
if (a == act) {
@@ -378,6 +377,8 @@ int tcf_unregister_action(struct tc_action_ops *act,
}
}
write_unlock(&act_mod_lock);
+ if (!err)
+ unregister_pernet_subsys(ops);
return err;
}
EXPORT_SYMBOL(tcf_unregister_action);
@@ -592,9 +593,19 @@ err_out:
return ERR_PTR(err);
}
-int tcf_action_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, char *name, int ovr,
- int bind, struct list_head *actions)
+static void cleanup_a(struct list_head *actions, int ovr)
+{
+ struct tc_action *a;
+
+ if (!ovr)
+ return;
+
+ list_for_each_entry(a, actions, list)
+ a->tcfa_refcnt--;
+}
+
+int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ char *name, int ovr, int bind, struct list_head *actions)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -612,8 +623,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
goto err;
}
act->order = i;
+ if (ovr)
+ act->tcfa_refcnt++;
list_add_tail(&act->list, actions);
}
+
+ /* Remove the temp refcnt which was necessary to protect against
+ * destroying an existing action which was being replaced
+ */
+ cleanup_a(actions, ovr);
return 0;
err:
@@ -650,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
- &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
@@ -895,7 +912,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
return ret;
}
err:
- tcf_action_destroy(&actions, 0);
+ if (event != RTM_GETACTION)
+ tcf_action_destroy(&actions, 0);
return ret;
}
@@ -923,9 +941,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return err;
}
-static int
-tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
- u32 portid, int ovr)
+static int tcf_action_add(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid, int ovr)
{
int ret = 0;
LIST_HEAD(actions);
@@ -988,8 +1005,7 @@ replay:
return ret;
}
-static struct nlattr *
-find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
{
struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -1008,16 +1024,14 @@ find_dump_kind(const struct nlmsghdr *n)
if (tb[1] == NULL)
return NULL;
- if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]),
- nla_len(tb[1]), NULL) < 0)
+ if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL) < 0)
return NULL;
kind = tb2[TCA_ACT_KIND];
return kind;
}
-static int
-tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nlmsghdr *nlh;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index bfa870731e74..520baa41cba3 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -28,24 +28,20 @@ struct tcf_bpf_cfg {
struct bpf_prog *filter;
struct sock_filter *bpf_ops;
const char *bpf_name;
- u32 bpf_fd;
u16 bpf_num_ops;
bool is_ebpf;
};
-static int bpf_net_id;
+static unsigned int bpf_net_id;
static struct tc_action_ops act_bpf_ops;
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
struct tcf_result *res)
{
+ bool at_ingress = skb_at_tc_ingress(skb);
struct tcf_bpf *prog = to_bpf(act);
struct bpf_prog *filter;
int action, filter_res;
- bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
-
- if (unlikely(!skb_mac_header_was_set(skb)))
- return TC_ACT_UNSPEC;
tcf_lastuse_update(&prog->tcf_tm);
bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
@@ -121,13 +117,18 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
return 0;
}
@@ -229,16 +230,13 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
return PTR_ERR(fp);
if (tb[TCA_ACT_BPF_NAME]) {
- name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
- nla_len(tb[TCA_ACT_BPF_NAME]),
- GFP_KERNEL);
+ name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL);
if (!name) {
bpf_prog_put(fp);
return -ENOMEM;
}
}
- cfg->bpf_fd = bpf_fd;
cfg->bpf_name = name;
cfg->filter = fp;
cfg->is_ebpf = true;
@@ -337,8 +335,6 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (cfg.bpf_num_ops)
prog->bpf_num_ops = cfg.bpf_num_ops;
- if (cfg.bpf_fd)
- prog->bpf_fd = cfg.bpf_fd;
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index eae07a2e774d..ab8062909962 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,7 +30,7 @@
#define CONNMARK_TAB_MASK 3
-static int connmark_net_id;
+static unsigned int connmark_net_id;
static struct tc_action_ops act_connmark_ops;
static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b5dbf633a863..a0edd80a44db 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -42,7 +42,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
-static int csum_net_id;
+static unsigned int csum_net_id;
static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
@@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
return (void *)(skb_network_header(skb) + ihl);
}
-static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmphdr *icmph;
@@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmp6hdr *icmp6h;
const struct ipv6hdr *ip6h;
@@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct iphdr *iph;
@@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct ipv6hdr *ip6h;
@@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct iphdr *iph;
@@ -270,8 +270,8 @@ ignore_obscure_skb:
return 1;
}
-static int tcf_csum_ipv6_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct ipv6hdr *ip6h;
@@ -380,8 +380,8 @@ fail:
return 0;
}
-static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
- unsigned int ixhl, unsigned int *pl)
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl,
+ unsigned int *pl)
{
int off, len, optlen;
unsigned char *xh = (void *)ip6xh;
@@ -494,8 +494,8 @@ fail:
return 0;
}
-static int tcf_csum(struct sk_buff *skb,
- const struct tc_action *a, struct tcf_result *res)
+static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
int action;
@@ -531,8 +531,8 @@ drop:
return TC_ACT_SHOT;
}
-static int tcf_csum_dump(struct sk_buff *skb,
- struct tc_action *a, int bind, int ref)
+static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_csum *p = to_tcf_csum(a);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e24a4093d6f6..e6c874a2b283 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,7 +25,7 @@
#define GACT_TAB_MASK 15
-static int gact_net_id;
+static unsigned int gact_net_id;
static struct tc_action_ops act_gact_ops;
#ifdef CONFIG_GACT_PROB
@@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets);
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
+ packets);
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 4a60cd5e1875..80b848d3f096 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -35,7 +35,7 @@
#define IFE_TAB_MASK 15
-static int ife_net_id;
+static unsigned int ife_net_id;
static int max_metacnt = IFE_META_MAX + 1;
static struct tc_action_ops act_ife_ops;
@@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
}
EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u16 edata = 0;
+
+ if (mi->metaval)
+ edata = *(u16 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htons(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
+
int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
{
if (mi->metaval)
@@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u16);
+
int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
{
u32 edata = metaval;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 378c1c976058..992ef8d624f1 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,10 +30,10 @@
#define IPT_TAB_MASK 15
-static int ipt_net_id;
+static unsigned int ipt_net_id;
static struct tc_action_ops act_ipt_ops;
-static int xt_net_id;
+static unsigned int xt_net_id;
static struct tc_action_ops act_xt_ops;
static int ipt_init_target(struct xt_entry_target *t, char *table,
@@ -213,6 +213,12 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
int ret = 0, result = 0;
struct tcf_ipt *ipt = to_ipt(a);
struct xt_action_param par;
+ struct nf_hook_state state = {
+ .net = dev_net(skb->dev),
+ .in = skb->dev,
+ .hook = ipt->tcfi_hook,
+ .pf = NFPROTO_IPV4,
+ };
if (skb_unclone(skb, GFP_ATOMIC))
return TC_ACT_UNSPEC;
@@ -226,13 +232,9 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
* worry later - danger - this API seems to have changed
* from earlier kernels
*/
- par.net = dev_net(skb->dev);
- par.in = skb->dev;
- par.out = NULL;
- par.hooknum = ipt->tcfi_hook;
+ par.state = &state;
par.target = ipt->tcfi_t->u.kernel.target;
par.targinfo = ipt->tcfi_t->data;
- par.family = NFPROTO_IPV4;
ret = par.target->target(skb, &par);
switch (ret) {
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 000000000000..3b35774ce890
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,79 @@
+/*
+ * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2016)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/rtnetlink.h>
+
+static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifetc_index = skb->tc_index;
+
+ return ife_encode_meta_u16(ifetc_index, skbdata, e);
+}
+
+static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u16 ifetc_index = *(u16 *)data;
+
+ skb->tc_index = ntohs(ifetc_index);
+ return 0;
+}
+
+static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u16(skb->tc_index, e);
+}
+
+static struct tcf_meta_ops ife_skbtcindex_ops = {
+ .metaid = IFE_META_TCINDEX,
+ .metatype = NLA_U16,
+ .name = "tc_index",
+ .synopsis = "skb tc_index 16 bit metadata",
+ .check_presence = skbtcindex_check,
+ .encode = skbtcindex_encode,
+ .decode = skbtcindex_decode,
+ .get = ife_get_meta_u16,
+ .alloc = ife_alloc_meta_u16,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u16,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifetc_index_init_module(void)
+{
+ return register_ife_op(&ife_skbtcindex_ops);
+}
+
+static void __exit ifetc_index_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbtcindex_ops);
+}
+
+module_init(ifetc_index_init_module);
+module_exit(ifetc_index_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2016)");
+MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6038c85d92f5..2d9fa6e0a1b4 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
+#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -33,6 +34,25 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
+static bool tcf_mirred_is_act_redirect(int action)
+{
+ return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
+}
+
+static u32 tcf_mirred_act_direction(int action)
+{
+ switch (action) {
+ case TCA_EGRESS_REDIR:
+ case TCA_EGRESS_MIRROR:
+ return AT_EGRESS;
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
+ return AT_INGRESS;
+ default:
+ BUG();
+ }
+}
+
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
@@ -51,7 +71,7 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
-static int mirred_net_id;
+static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
@@ -60,11 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ bool mac_header_xmit = false;
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
- int ret, ok_push = 0;
bool exists = false;
+ int ret;
if (nla == NULL)
return -EINVAL;
@@ -82,6 +103,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
switch (parm->eaction) {
case TCA_EGRESS_MIRROR:
case TCA_EGRESS_REDIR:
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
break;
default:
if (exists)
@@ -95,19 +118,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
tcf_hash_release(*a, bind);
return -ENODEV;
}
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- ok_push = 0;
- break;
- default:
- ok_push = 1;
- break;
- }
+ mac_header_xmit = dev_is_mac_header_xmit(dev);
} else {
dev = NULL;
}
@@ -136,7 +147,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
rcu_assign_pointer(m->tcfm_dev, dev);
- m->tcfm_ok_push = ok_push;
+ m->tcfm_mac_header_xmit = mac_header_xmit;
}
if (ret == ACT_P_CREATED) {
@@ -153,15 +164,20 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_mirred *m = to_mirred(a);
+ bool m_mac_header_xmit;
struct net_device *dev;
struct sk_buff *skb2;
- int retval, err;
+ int retval, err = 0;
+ int m_eaction;
+ int mac_len;
u32 at;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
rcu_read_lock();
+ m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
+ m_eaction = READ_ONCE(m->tcfm_eaction);
retval = READ_ONCE(m->tcf_action);
dev = rcu_dereference(m->tcfm_dev);
if (unlikely(!dev)) {
@@ -180,23 +196,36 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
if (!skb2)
goto out;
- if (!(at & AT_EGRESS)) {
- if (m->tcfm_ok_push)
+ /* If action's target direction differs than filter's direction,
+ * and devices expect a mac header on xmit, then mac push/pull is
+ * needed.
+ */
+ if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) {
+ if (at & AT_EGRESS) {
+ /* caught at egress, act ingress: pull mac */
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ skb_pull_rcsum(skb2, mac_len);
+ } else {
+ /* caught at ingress, act egress: push mac */
skb_push_rcsum(skb2, skb->mac_len);
+ }
}
/* mirror is always swallowed */
- if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ if (tcf_mirred_is_act_redirect(m_eaction))
skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
- err = dev_queue_xmit(skb2);
+ if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS)
+ err = dev_queue_xmit(skb2);
+ else
+ err = netif_receive_skb(skb2);
if (err) {
out:
qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
- if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
rcu_read_unlock();
@@ -204,7 +233,18 @@ out:
return retval;
}
-static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ struct tcf_t *tm = &m->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ tm->lastuse = lastuse;
+}
+
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = to_mirred(a);
@@ -275,17 +315,30 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
+static int tcf_mirred_device(const struct tc_action *a, struct net *net,
+ struct net_device **mirred_dev)
+{
+ int ifindex = tcf_mirred_ifindex(a);
+
+ *mirred_dev = __dev_get_by_index(net, ifindex);
+ if (!*mirred_dev)
+ return -EINVAL;
+ return 0;
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred,
+ .stats_update = tcf_stats_update,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
+ .get_dev = tcf_mirred_device,
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 8e8b0cc30704..9b6aec665495 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,7 +31,7 @@
#define NAT_TAB_MASK 15
-static int nat_net_id;
+static unsigned int nat_net_id;
static struct tc_action_ops act_nat_ops;
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b54d56d4959b..b27c4daec88f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -25,7 +25,7 @@
#define PEDIT_TAB_MASK 15
-static int pedit_net_id;
+static unsigned int pedit_net_id;
static struct tc_action_ops act_pedit_ops;
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
@@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
kfree(keys);
}
+static bool offset_valid(struct sk_buff *skb, int offset)
+{
+ if (offset > 0 && offset > skb->len)
+ return false;
+
+ if (offset < 0 && -offset > skb_headroom(skb))
+ return false;
+
+ return true;
+}
+
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
if (tkey->offmask) {
char *d, _d;
+ if (!offset_valid(skb, off + tkey->at)) {
+ pr_info("tc filter pedit 'at' offset %d out of bounds\n",
+ off + tkey->at);
+ goto bad;
+ }
d = skb_header_pointer(skb, off + tkey->at, 1,
&_d);
if (!d)
@@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
" offset must be on 32 bit boundaries\n");
goto bad;
}
- if (offset > 0 && offset > skb->len) {
- pr_info("tc filter pedit"
- " offset %d can't exceed pkt length %d\n",
- offset, skb->len);
+
+ if (!offset_valid(skb, off + offset)) {
+ pr_info("tc filter pedit offset %d out of bounds\n",
+ offset);
goto bad;
}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8a3be1d99775..0ba91d1ce994 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,7 +55,7 @@ struct tc_police_compat {
/* Each policer is serialized by its individual spinlock */
-static int police_net_id;
+static unsigned int police_net_id;
static struct tc_action_ops act_police_ops;
static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
+ !gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
- if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
@@ -249,6 +252,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
+ if (police->tcfp_result == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
@@ -261,8 +266,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
return police->tcf_action;
}
-static int
-tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = to_police(a);
@@ -347,14 +352,12 @@ static struct pernet_operations police_net_ops = {
.size = sizeof(struct tc_action_net),
};
-static int __init
-police_init_module(void)
+static int __init police_init_module(void)
{
return tcf_register_action(&act_police_ops, &police_net_ops);
}
-static void __exit
-police_cleanup_module(void)
+static void __exit police_cleanup_module(void)
{
tcf_unregister_action(&act_police_ops, &police_net_ops);
}
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 289af6f9bb3b..823a73ad0c60 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,7 +26,7 @@
#define SIMP_TAB_MASK 7
-static int simp_net_id;
+static unsigned int simp_net_id;
static struct tc_action_ops act_simp_ops;
#define SIMP_MAX_DATA 32
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index a133dcb82132..06ccae3c12ee 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,7 +29,7 @@
#define SKBEDIT_TAB_MASK 15
-static int skbedit_net_id;
+static unsigned int skbedit_net_id;
static struct tc_action_ops act_skbedit_ops;
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
@@ -46,8 +46,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
skb->dev->real_num_tx_queues > d->queue_mapping)
skb_set_queue_mapping(skb, d->queue_mapping);
- if (d->flags & SKBEDIT_F_MARK)
- skb->mark = d->mark;
+ if (d->flags & SKBEDIT_F_MARK) {
+ skb->mark &= ~d->mask;
+ skb->mark |= d->mark & d->mask;
+ }
if (d->flags & SKBEDIT_F_PTYPE)
skb->pkt_type = d->ptype;
@@ -61,6 +63,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
[TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
+ [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -71,7 +74,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
- u32 flags = 0, *priority = NULL, *mark = NULL;
+ u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
u16 *queue_mapping = NULL, *ptype = NULL;
bool exists = false;
int ret = 0, err;
@@ -108,6 +111,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
}
+ if (tb[TCA_SKBEDIT_MASK] != NULL) {
+ flags |= SKBEDIT_F_MASK;
+ mask = nla_data(tb[TCA_SKBEDIT_MASK]);
+ }
+
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
exists = tcf_hash_check(tn, parm->index, a, bind);
@@ -145,6 +153,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
d->mark = *mark;
if (flags & SKBEDIT_F_PTYPE)
d->ptype = *ptype;
+ /* default behaviour is to use all the bits */
+ d->mask = 0xffffffff;
+ if (flags & SKBEDIT_F_MASK)
+ d->mask = *mask;
d->tcf_action = parm->action;
@@ -182,6 +194,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
if ((d->flags & SKBEDIT_F_PTYPE) &&
nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_MASK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
+ goto nla_put_failure;
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 000000000000..3b7074e23024
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,301 @@
+/*
+ * net/sched/act_skbmod.c skb data modifier
+ *
+ * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbmod.h>
+#include <net/tc_act/tc_skbmod.h>
+
+#define SKBMOD_TAB_MASK 15
+
+static unsigned int skbmod_net_id;
+static struct tc_action_ops act_skbmod_ops;
+
+#define MAX_EDIT_LEN ETH_HLEN
+static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ int action;
+ struct tcf_skbmod_params *p;
+ u64 flags;
+ int err;
+
+ tcf_lastuse_update(&d->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ /* XXX: if you are going to edit more fields beyond ethernet header
+ * (example when you add IP header replacement or vlan swap)
+ * then MAX_EDIT_LEN needs to change appropriately
+ */
+ err = skb_ensure_writable(skb, MAX_EDIT_LEN);
+ if (unlikely(err)) { /* best policy is to drop on the floor */
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ rcu_read_lock();
+ action = READ_ONCE(d->tcf_action);
+ if (unlikely(action == TC_ACT_SHOT)) {
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ rcu_read_unlock();
+ return action;
+ }
+
+ p = rcu_dereference(d->skbmod_p);
+ flags = p->flags;
+ if (flags & SKBMOD_F_DMAC)
+ ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
+ if (flags & SKBMOD_F_SMAC)
+ ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
+ if (flags & SKBMOD_F_ETYPE)
+ eth_hdr(skb)->h_proto = p->eth_type;
+ rcu_read_unlock();
+
+ if (flags & SKBMOD_F_SWAPMAC) {
+ u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
+ /*XXX: I am sure we can come up with more efficient swapping*/
+ ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
+ ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
+ ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
+ }
+
+ return action;
+}
+
+static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
+ [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) },
+ [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 },
+};
+
+static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+ struct nlattr *tb[TCA_SKBMOD_MAX + 1];
+ struct tcf_skbmod_params *p, *p_old;
+ struct tc_skbmod *parm;
+ struct tcf_skbmod *d;
+ bool exists = false;
+ u8 *daddr = NULL;
+ u8 *saddr = NULL;
+ u16 eth_type = 0;
+ u32 lflags = 0;
+ int ret = 0, err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_SKBMOD_PARMS])
+ return -EINVAL;
+
+ if (tb[TCA_SKBMOD_DMAC]) {
+ daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
+ lflags |= SKBMOD_F_DMAC;
+ }
+
+ if (tb[TCA_SKBMOD_SMAC]) {
+ saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
+ lflags |= SKBMOD_F_SMAC;
+ }
+
+ if (tb[TCA_SKBMOD_ETYPE]) {
+ eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
+ lflags |= SKBMOD_F_ETYPE;
+ }
+
+ parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ if (parm->flags & SKBMOD_F_SWAPMAC)
+ lflags = SKBMOD_F_SWAPMAC;
+
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (!lflags)
+ return -EINVAL;
+
+ if (!exists) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ &act_skbmod_ops, bind, true);
+ if (ret)
+ return ret;
+
+ ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ d = to_skbmod(*a);
+
+ ASSERT_RTNL();
+ p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
+ if (unlikely(!p)) {
+ if (ovr)
+ tcf_hash_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ p->flags = lflags;
+ d->tcf_action = parm->action;
+
+ p_old = rtnl_dereference(d->skbmod_p);
+
+ if (ovr)
+ spin_lock_bh(&d->tcf_lock);
+
+ if (lflags & SKBMOD_F_DMAC)
+ ether_addr_copy(p->eth_dst, daddr);
+ if (lflags & SKBMOD_F_SMAC)
+ ether_addr_copy(p->eth_src, saddr);
+ if (lflags & SKBMOD_F_ETYPE)
+ p->eth_type = htons(eth_type);
+
+ rcu_assign_pointer(d->skbmod_p, p);
+ if (ovr)
+ spin_unlock_bh(&d->tcf_lock);
+
+ if (p_old)
+ kfree_rcu(p_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ struct tcf_skbmod_params *p;
+
+ p = rcu_dereference_protected(d->skbmod_p, 1);
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
+ struct tc_skbmod opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
+ struct tcf_t t;
+
+ opt.flags = p->flags;
+ if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_DMAC) &&
+ nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_SMAC) &&
+ nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_ETYPE) &&
+ nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type)))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbmod_ops = {
+ .kind = "skbmod",
+ .type = TCA_ACT_SKBMOD,
+ .owner = THIS_MODULE,
+ .act = tcf_skbmod_run,
+ .dump = tcf_skbmod_dump,
+ .init = tcf_skbmod_init,
+ .cleanup = tcf_skbmod_cleanup,
+ .walk = tcf_skbmod_walker,
+ .lookup = tcf_skbmod_search,
+ .size = sizeof(struct tcf_skbmod),
+};
+
+static __net_init int skbmod_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK);
+}
+
+static void __net_exit skbmod_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbmod_net_ops = {
+ .init = skbmod_init_net,
+ .exit = skbmod_exit_net,
+ .id = &skbmod_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
+MODULE_DESCRIPTION("SKB data mod-ing");
+MODULE_LICENSE("GPL");
+
+static int __init skbmod_init_module(void)
+{
+ return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+static void __exit skbmod_cleanup_module(void)
+{
+ tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+module_init(skbmod_init_module);
+module_exit(skbmod_cleanup_module);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 000000000000..e3a58e021198
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_tunnel_key.h>
+
+#define TUNNEL_KEY_TAB_MASK 15
+
+static unsigned int tunnel_key_net_id;
+static struct tc_action_ops act_tunnel_key_ops;
+
+static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ int action;
+
+ rcu_read_lock();
+
+ params = rcu_dereference(t->params);
+
+ tcf_lastuse_update(&t->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ action = params->action;
+
+ switch (params->tcft_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ skb_dst_drop(skb);
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
+ break;
+ default:
+ WARN_ONCE(1, "Bad tunnel_key action %d.\n",
+ params->tcft_action);
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return action;
+}
+
+static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
+ [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
+ [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
+};
+
+static int tunnel_key_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+ struct tcf_tunnel_key_params *params_old;
+ struct tcf_tunnel_key_params *params_new;
+ struct metadata_dst *metadata = NULL;
+ struct tc_tunnel_key *parm;
+ struct tcf_tunnel_key *t;
+ bool exists = false;
+ __be16 dst_port = 0;
+ __be64 key_id;
+ int ret = 0;
+ int err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ switch (parm->t_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+
+ if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
+ dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
+
+ if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
+ __be32 saddr;
+ __be32 daddr;
+
+ saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+ daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+
+ metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+ dst_port, TUNNEL_KEY,
+ key_id, 0);
+ } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+
+ saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+ daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+ 0, TUNNEL_KEY,
+ key_id, 0);
+ }
+
+ if (!metadata) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
+ break;
+ default:
+ goto err_out;
+ }
+
+ if (!exists) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ &act_tunnel_key_ops, bind, true);
+ if (ret)
+ return ret;
+
+ ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ t = to_tunnel_key(*a);
+
+ ASSERT_RTNL();
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ if (ret == ACT_P_CREATED)
+ tcf_hash_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ params_old = rtnl_dereference(t->params);
+
+ params_new->action = parm->action;
+ params_new->tcft_action = parm->t_action;
+ params_new->tcft_enc_metadata = metadata;
+
+ rcu_assign_pointer(t->params, params_new);
+
+ if (params_old)
+ kfree_rcu(params_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, *a);
+
+ return ret;
+
+err_out:
+ if (exists)
+ tcf_hash_release(*a, bind);
+ return ret;
+}
+
+static void tunnel_key_release(struct tc_action *a, int bind)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+
+ params = rcu_dereference_protected(t->params, 1);
+
+ if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+ dst_release(&params->tcft_enc_metadata->dst);
+
+ kfree_rcu(params, rcu);
+}
+
+static int tunnel_key_dump_addresses(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ unsigned short family = ip_tunnel_info_af(info);
+
+ if (family == AF_INET) {
+ __be32 saddr = info->key.u.ipv4.src;
+ __be32 daddr = info->key.u.ipv4.dst;
+
+ if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) &&
+ !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr))
+ return 0;
+ }
+
+ if (family == AF_INET6) {
+ const struct in6_addr *saddr6 = &info->key.u.ipv6.src;
+ const struct in6_addr *daddr6 = &info->key.u.ipv6.dst;
+
+ if (!nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) &&
+ !nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6))
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ struct tc_tunnel_key opt = {
+ .index = t->tcf_index,
+ .refcnt = t->tcf_refcnt - ref,
+ .bindcnt = t->tcf_bindcnt - bind,
+ };
+ struct tcf_t tm;
+
+ params = rtnl_dereference(t->params);
+
+ opt.t_action = params->tcft_action;
+ opt.action = params->action;
+
+ if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+ struct ip_tunnel_key *key =
+ &params->tcft_enc_metadata->u.tun_info.key;
+ __be32 key_id = tunnel_id_to_key32(key->tun_id);
+
+ if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+ tunnel_key_dump_addresses(skb,
+ &params->tcft_enc_metadata->u.tun_info) ||
+ nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst))
+ goto nla_put_failure;
+ }
+
+ tcf_tm_dump(&tm, &t->tcf_tm);
+ if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
+ &tm, TCA_TUNNEL_KEY_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_tunnel_key_ops = {
+ .kind = "tunnel_key",
+ .type = TCA_ACT_TUNNEL_KEY,
+ .owner = THIS_MODULE,
+ .act = tunnel_key_act,
+ .dump = tunnel_key_dump,
+ .init = tunnel_key_init,
+ .cleanup = tunnel_key_release,
+ .walk = tunnel_key_walker,
+ .lookup = tunnel_key_search,
+ .size = sizeof(struct tcf_tunnel_key),
+};
+
+static __net_init int tunnel_key_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK);
+}
+
+static void __net_exit tunnel_key_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations tunnel_key_net_ops = {
+ .init = tunnel_key_init_net,
+ .exit = tunnel_key_exit_net,
+ .id = &tunnel_key_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init tunnel_key_init_module(void)
+{
+ return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+static void __exit tunnel_key_cleanup_module(void)
+{
+ tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+module_init(tunnel_key_init_module);
+module_exit(tunnel_key_cleanup_module);
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 691409de3e1a..19e0dba305ce 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,7 +21,7 @@
#define VLAN_TAB_MASK 15
-static int vlan_net_id;
+static unsigned int vlan_net_id;
static struct tc_action_ops act_vlan_ops;
static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
@@ -30,12 +30,19 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
struct tcf_vlan *v = to_vlan(a);
int action;
int err;
+ u16 tci;
spin_lock(&v->tcf_lock);
tcf_lastuse_update(&v->tcf_tm);
bstats_update(&v->tcf_bstats, skb);
action = v->tcf_action;
+ /* Ensure 'data' points at mac_header prior calling vlan manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb))
+ skb_push_rcsum(skb, skb->mac_len);
+
switch (v->tcfv_action) {
case TCA_VLAN_ACT_POP:
err = skb_vlan_pop(skb);
@@ -43,10 +50,35 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
goto drop;
break;
case TCA_VLAN_ACT_PUSH:
- err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid);
+ err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid |
+ (v->tcfv_push_prio << VLAN_PRIO_SHIFT));
if (err)
goto drop;
break;
+ case TCA_VLAN_ACT_MODIFY:
+ /* No-op if no vlan tag (either hw-accel or in-payload) */
+ if (!skb_vlan_tagged(skb))
+ goto unlock;
+ /* extract existing tag (and guarantee no hw-accel tag) */
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ skb->vlan_tci = 0;
+ } else {
+ /* in-payload vlan tag, pop it */
+ err = __skb_vlan_pop(skb, &tci);
+ if (err)
+ goto drop;
+ }
+ /* replace the vid */
+ tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
+ /* replace prio bits, if tcfv_push_prio specified */
+ if (v->tcfv_push_prio) {
+ tci &= ~VLAN_PRIO_MASK;
+ tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
+ }
+ /* put updated tci as hwaccel tag */
+ __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci);
+ break;
default:
BUG();
}
@@ -57,6 +89,9 @@ drop:
action = TC_ACT_SHOT;
v->tcf_qstats.drops++;
unlock:
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
spin_unlock(&v->tcf_lock);
return action;
}
@@ -65,6 +100,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
[TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
[TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
+ [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
};
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
@@ -78,6 +114,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
int action;
__be16 push_vid = 0;
__be16 push_proto = 0;
+ u8 push_prio = 0;
bool exists = false;
int ret = 0, err;
@@ -99,6 +136,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
case TCA_VLAN_ACT_POP:
break;
case TCA_VLAN_ACT_PUSH:
+ case TCA_VLAN_ACT_MODIFY:
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
tcf_hash_release(*a, bind);
@@ -123,6 +161,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
} else {
push_proto = htons(ETH_P_8021Q);
}
+
+ if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
+ push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
break;
default:
if (exists)
@@ -150,6 +191,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
v->tcfv_action = action;
v->tcfv_push_vid = push_vid;
+ v->tcfv_push_prio = push_prio;
v->tcfv_push_proto = push_proto;
v->tcf_action = parm->action;
@@ -178,10 +220,13 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
+ if ((v->tcfv_action == TCA_VLAN_ACT_PUSH ||
+ v->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
(nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
- v->tcfv_push_proto)))
+ v->tcfv_push_proto) ||
+ (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
+ v->tcfv_push_prio))))
goto nla_put_failure;
tcf_tm_dump(&t, &v->tcf_tm);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a7c5645373af..1ecdf809b5fa 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(unregister_tcf_proto_ops);
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event);
+ unsigned long fh, int event, bool unicast);
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n,
@@ -112,7 +112,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
it_chain = &tp->next)
- tfilter_notify(net, oskb, n, tp, 0, event);
+ tfilter_notify(net, oskb, n, tp, 0, event, false);
}
/* Select new prio value from the range, managed by kernel. */
@@ -148,13 +148,15 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
unsigned long cl;
unsigned long fh;
int err;
- int tp_created = 0;
+ int tp_created;
if ((n->nlmsg_type != RTM_GETTFILTER) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
replay:
+ tp_created = 0;
+
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
@@ -319,7 +321,8 @@ replay:
RCU_INIT_POINTER(*back, next);
- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ tfilter_notify(net, skb, n, tp, fh,
+ RTM_DELTFILTER, false);
tcf_destroy(tp, true);
err = 0;
goto errout;
@@ -344,13 +347,16 @@ replay:
if (err == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ tfilter_notify(net, skb, n, tp,
+ t->tcm_handle,
+ RTM_DELTFILTER, false);
if (tcf_destroy(tp, false))
RCU_INIT_POINTER(*back, next);
}
goto errout;
case RTM_GETTFILTER:
- err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ err = tfilter_notify(net, skb, n, tp, fh,
+ RTM_NEWTFILTER, true);
goto errout;
default:
err = -EINVAL;
@@ -365,7 +371,7 @@ replay:
RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
rcu_assign_pointer(*back, tp);
}
- tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
} else {
if (tp_created)
tcf_destroy(tp, true);
@@ -417,7 +423,7 @@ nla_put_failure:
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event)
+ unsigned long fh, int event, bool unicast)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -426,11 +432,15 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (!skb)
return -ENOBUFS;
- if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
}
@@ -448,7 +458,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct net *net = sock_net(a->skb->sk);
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER);
}
/* called with RTNL */
@@ -552,7 +563,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
EXPORT_SYMBOL(tcf_exts_destroy);
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
- struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
{
@@ -560,8 +571,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
- "police", ovr,
- TCA_ACT_BIND);
+ "police", ovr, TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -573,8 +583,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
int err, i = 0;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
- NULL, ovr,
- TCA_ACT_BIND, &actions);
+ NULL, ovr, TCA_ACT_BIND,
+ &actions);
if (err)
return err;
list_for_each_entry(act, &actions, list)
@@ -674,6 +684,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+ struct net_device **hw_dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tc_action *a;
+ LIST_HEAD(actions);
+
+ if (tc_no_actions(exts))
+ return -EINVAL;
+
+ tcf_exts_to_list(exts, &actions);
+ list_for_each_entry(a, &actions, list) {
+ if (a->ops->get_dev) {
+ a->ops->get_dev(a, dev_net(dev), hw_dev);
+ break;
+ }
+ }
+ if (*hw_dev)
+ return 0;
+#endif
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(tcf_exts_get_dev);
+
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 0b8c3ace671f..5877f6061b57 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -62,9 +62,6 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
- if (head == NULL)
- return 0UL;
-
list_for_each_entry(f, &head->flist, link) {
if (f->handle == handle) {
l = (unsigned long) f;
@@ -109,7 +106,6 @@ static bool basic_destroy(struct tcf_proto *tp, bool force)
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, basic_delete_filter);
}
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
@@ -138,10 +134,12 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
struct tcf_exts e;
struct tcf_ematch_tree t;
- tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ goto errout;
err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
if (err < 0)
@@ -189,7 +187,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
if (!fnew)
return -ENOBUFS;
- tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ if (err < 0)
+ goto errout;
+
err = -EINVAL;
if (handle) {
fnew->handle = handle;
@@ -226,6 +227,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
+ tcf_exts_destroy(&fnew->exts);
kfree(fnew);
return err;
}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c3002c2c68bb..d9c97018317d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
#define CLS_BPF_NAME_LEN 256
+#define CLS_BPF_SUPPORTED_GEN_FLAGS \
+ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
struct cls_bpf_head {
struct list_head plist;
@@ -39,12 +41,11 @@ struct cls_bpf_prog {
struct list_head link;
struct tcf_result res;
bool exts_integrated;
+ bool offloaded;
+ u32 gen_flags;
struct tcf_exts exts;
u32 handle;
- union {
- u32 bpf_fd;
- u16 bpf_num_ops;
- };
+ u16 bpf_num_ops;
struct sock_filter *bpf_ops;
const char *bpf_name;
struct tcf_proto *tp;
@@ -54,8 +55,10 @@ struct cls_bpf_prog {
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
[TCA_BPF_FLAGS] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 },
[TCA_BPF_FD] = { .type = NLA_U32 },
- [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
+ [TCA_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = CLS_BPF_NAME_LEN },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
[TCA_BPF_OPS] = { .type = NLA_BINARY,
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
@@ -83,9 +86,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_bpf_prog *prog;
int ret = -1;
- if (unlikely(!skb_mac_header_was_set(skb)))
- return -1;
-
/* Needed here for accessing maps. */
rcu_read_lock();
list_for_each_entry_rcu(prog, &head->plist, link) {
@@ -93,7 +93,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
- if (at_ingress) {
+ if (tc_skip_sw(prog->gen_flags)) {
+ filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
+ } else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
bpf_compute_data_end(skb);
@@ -140,6 +142,91 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
return !prog->bpf_ops;
}
+static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ enum tc_clsbpf_command cmd)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_bpf_offload bpf_offload = {};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSBPF;
+ offload.cls_bpf = &bpf_offload;
+
+ bpf_offload.command = cmd;
+ bpf_offload.exts = &prog->exts;
+ bpf_offload.prog = prog->filter;
+ bpf_offload.name = prog->bpf_name;
+ bpf_offload.exts_integrated = prog->exts_integrated;
+ bpf_offload.gen_flags = prog->gen_flags;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+}
+
+static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct cls_bpf_prog *oldprog)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct cls_bpf_prog *obj = prog;
+ enum tc_clsbpf_command cmd;
+ bool skip_sw;
+ int ret;
+
+ skip_sw = tc_skip_sw(prog->gen_flags) ||
+ (oldprog && tc_skip_sw(oldprog->gen_flags));
+
+ if (oldprog && oldprog->offloaded) {
+ if (tc_should_offload(dev, tp, prog->gen_flags)) {
+ cmd = TC_CLSBPF_REPLACE;
+ } else if (!tc_skip_sw(prog->gen_flags)) {
+ obj = oldprog;
+ cmd = TC_CLSBPF_DESTROY;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ if (!tc_should_offload(dev, tp, prog->gen_flags))
+ return skip_sw ? -EINVAL : 0;
+ cmd = TC_CLSBPF_ADD;
+ }
+
+ ret = cls_bpf_offload_cmd(tp, obj, cmd);
+ if (ret)
+ return skip_sw ? ret : 0;
+
+ obj->offloaded = true;
+ if (oldprog)
+ oldprog->offloaded = false;
+
+ return 0;
+}
+
+static void cls_bpf_stop_offload(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ int err;
+
+ if (!prog->offloaded)
+ return;
+
+ err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
+ if (err) {
+ pr_err("Stopping hardware offload failed: %d\n", err);
+ return;
+ }
+
+ prog->offloaded = false;
+}
+
+static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ if (!prog->offloaded)
+ return;
+
+ cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
+}
+
static int cls_bpf_init(struct tcf_proto *tp)
{
struct cls_bpf_head *head;
@@ -154,7 +241,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
return 0;
}
-static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
{
tcf_exts_destroy(&prog->exts);
@@ -168,21 +255,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
kfree(prog);
}
-static void __cls_bpf_delete_prog(struct rcu_head *rcu)
+static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
{
- struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
- cls_bpf_delete_prog(prog->tp, prog);
+ __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
}
-static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
- struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
-
+ cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+}
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
return 0;
}
@@ -194,13 +282,9 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
if (!force && !list_empty(&head->plist))
return false;
- list_for_each_entry_safe(prog, tmp, &head->plist, link) {
- list_del_rcu(&prog->link);
- tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
- }
+ list_for_each_entry_safe(prog, tmp, &head->plist, link)
+ __cls_bpf_delete(tp, prog);
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
@@ -211,9 +295,6 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
struct cls_bpf_prog *prog;
unsigned long ret = 0UL;
- if (head == NULL)
- return 0UL;
-
list_for_each_entry(prog, &head->plist, link) {
if (prog->handle == handle) {
ret = (unsigned long) prog;
@@ -277,9 +358,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
return PTR_ERR(fp);
if (tb[TCA_BPF_NAME]) {
- name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
- nla_len(tb[TCA_BPF_NAME]),
- GFP_KERNEL);
+ name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
if (!name) {
bpf_prog_put(fp);
return -ENOMEM;
@@ -287,7 +366,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
}
prog->bpf_ops = NULL;
- prog->bpf_fd = bpf_fd;
prog->bpf_name = name;
prog->filter = fp;
@@ -304,6 +382,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
{
bool is_bpf, is_ebpf, have_exts = false;
struct tcf_exts exts;
+ u32 gen_flags = 0;
int ret;
is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
@@ -311,30 +390,39 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
return -EINVAL;
- tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
- ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+ ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
if (ret < 0)
return ret;
+ ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+ if (ret < 0)
+ goto errout;
if (tb[TCA_BPF_FLAGS]) {
u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
- tcf_exts_destroy(&exts);
- return -EINVAL;
+ ret = -EINVAL;
+ goto errout;
}
have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
}
+ if (tb[TCA_BPF_FLAGS_GEN]) {
+ gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+ if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+ !tc_flags_valid(gen_flags)) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
prog->exts_integrated = have_exts;
+ prog->gen_flags = gen_flags;
ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
cls_bpf_prog_from_efd(tb, prog, tp);
- if (ret < 0) {
- tcf_exts_destroy(&exts);
- return ret;
- }
+ if (ret < 0)
+ goto errout;
if (tb[TCA_BPF_CLASSID]) {
prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
@@ -343,6 +431,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
tcf_exts_change(tp, &prog->exts, &exts);
return 0;
+
+errout:
+ tcf_exts_destroy(&exts);
+ return ret;
}
static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
@@ -388,7 +480,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
if (!prog)
return -ENOBUFS;
- tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ if (ret < 0)
+ goto errout;
if (oldprog) {
if (handle && oldprog->handle != handle) {
@@ -406,23 +500,31 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
+ ovr);
if (ret < 0)
goto errout;
+ ret = cls_bpf_offload(tp, prog, oldprog);
+ if (ret) {
+ __cls_bpf_delete_prog(prog);
+ return ret;
+ }
+
if (oldprog) {
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
- call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
} else {
list_add_rcu(&prog->link, &head->plist);
}
*arg = (unsigned long) prog;
return 0;
+
errout:
+ tcf_exts_destroy(&prog->exts);
kfree(prog);
-
return ret;
}
@@ -447,13 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
return 0;
}
@@ -470,6 +577,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
tm->tcm_handle = prog->handle;
+ cls_bpf_offload_update_stats(tp, prog);
+
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -492,6 +601,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
goto nla_put_failure;
+ if (prog->gen_flags &&
+ nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 4c85bd3a750c..c1f20077837f 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -93,7 +93,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
if (!new)
return -ENOBUFS;
- tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ if (err < 0)
+ goto errout;
new->handle = handle;
new->tp = tp;
err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
@@ -101,10 +103,14 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
goto errout;
- tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ err = tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
if (err < 0)
goto errout;
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0) {
+ tcf_exts_destroy(&e);
+ goto errout;
+ }
err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
if (err < 0) {
@@ -120,6 +126,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
return 0;
errout:
+ tcf_exts_destroy(&new->exts);
kfree(new);
return err;
}
@@ -130,11 +137,10 @@ static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force)
if (!force)
return false;
-
- if (head) {
- RCU_INIT_POINTER(tp->root, NULL);
+ /* Head can still be NULL due to cls_cgroup_init(). */
+ if (head)
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
- }
+
return true;
}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index fbfec6a18839..6575aba87630 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -29,7 +29,7 @@
#include <net/route.h>
#include <net/flow_dissector.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
@@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
}
-static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return flow->basic.ip_proto;
}
-static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.src);
@@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys
return addr_fold(skb->sk);
}
-static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.dst);
@@ -125,14 +128,14 @@ static u32 flow_get_mark(const struct sk_buff *skb)
static u32 flow_get_nfct(const struct sk_buff *skb)
{
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
return addr_fold(skb->nfct);
#else
return 0;
#endif
}
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
@@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -161,7 +165,8 @@ fallback:
return flow_get_src(skb, flow);
}
-static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -173,14 +178,16 @@ fallback:
return flow_get_dst(skb, flow);
}
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
return flow_get_proto_src(skb, flow);
}
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
@@ -418,10 +425,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
return -EOPNOTSUPP;
}
- tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ if (err < 0)
+ goto err1;
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
- return err;
+ goto err1;
err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
if (err < 0)
@@ -432,13 +441,15 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (!fnew)
goto err2;
- tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ if (err < 0)
+ goto err3;
fold = (struct flow_filter *)*arg;
if (fold) {
err = -EINVAL;
if (fold->handle != handle && handle)
- goto err2;
+ goto err3;
/* Copy fold into fnew */
fnew->tp = fold->tp;
@@ -458,31 +469,31 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (tb[TCA_FLOW_MODE])
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
if (mode != FLOW_MODE_HASH && nkeys > 1)
- goto err2;
+ goto err3;
if (mode == FLOW_MODE_HASH)
perturb_period = fold->perturb_period;
if (tb[TCA_FLOW_PERTURB]) {
if (mode != FLOW_MODE_HASH)
- goto err2;
+ goto err3;
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
}
} else {
err = -EINVAL;
if (!handle)
- goto err2;
+ goto err3;
if (!tb[TCA_FLOW_KEYS])
- goto err2;
+ goto err3;
mode = FLOW_MODE_MAP;
if (tb[TCA_FLOW_MODE])
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
if (mode != FLOW_MODE_HASH && nkeys > 1)
- goto err2;
+ goto err3;
if (tb[TCA_FLOW_PERTURB]) {
if (mode != FLOW_MODE_HASH)
- goto err2;
+ goto err3;
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
}
@@ -542,6 +553,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
call_rcu(&fold->rcu, flow_destroy_filter);
return 0;
+err3:
+ tcf_exts_destroy(&fnew->exts);
err2:
tcf_em_tree_destroy(&t);
kfree(fnew);
@@ -583,7 +596,6 @@ static bool flow_destroy(struct tcf_proto *tp, bool force)
list_del_rcu(&f->list);
call_rcu(&f->rcu, flow_destroy_filter);
}
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5060801a2f6d..5752789acc13 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rhashtable.h>
+#include <linux/workqueue.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
@@ -23,17 +24,28 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
+ struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
- struct flow_dissector_key_addrs ipaddrs;
+ struct flow_dissector_key_vlan vlan;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_icmp icmp;
+ struct flow_dissector_key_keyid enc_key_id;
+ union {
+ struct flow_dissector_key_ipv4_addrs enc_ipv4;
+ struct flow_dissector_key_ipv6_addrs enc_ipv6;
+ };
+ struct flow_dissector_key_ports enc_tp;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -55,7 +67,10 @@ struct cls_fl_head {
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
- struct rcu_head rcu;
+ union {
+ struct work_struct work;
+ struct rcu_head rcu;
+ };
};
struct cls_fl_filter {
@@ -68,6 +83,8 @@ struct cls_fl_filter {
u32 handle;
u32 flags;
struct rcu_head rcu;
+ struct tc_to_netdev tc;
+ struct net_device *hw_dev;
};
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -123,11 +140,37 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_fl_filter *f;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ struct ip_tunnel_info *info;
if (!atomic_read(&head->ht.nelems))
return -1;
fl_clear_masked_range(&skb_key, &head->mask);
+
+ info = skb_tunnel_info(skb);
+ if (info) {
+ struct ip_tunnel_key *key = &info->key;
+
+ switch (ip_tunnel_info_af(info)) {
+ case AF_INET:
+ skb_key.enc_control.addr_type =
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ skb_key.enc_ipv4.src = key->u.ipv4.src;
+ skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+ break;
+ case AF_INET6:
+ skb_key.enc_control.addr_type =
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ skb_key.enc_ipv6.src = key->u.ipv6.src;
+ skb_key.enc_ipv6.dst = key->u.ipv6.dst;
+ break;
+ }
+
+ skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+ skb_key.enc_tp.src = key->tp_src;
+ skb_key.enc_tp.dst = key->tp_dst;
+ }
+
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
* so do it rather here.
@@ -169,74 +212,108 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
-static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_DESTROY;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct flow_dissector *dissector,
struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+ struct cls_fl_filter *f)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct tc_to_netdev *tc = &f->tc;
int err;
- if (!tc_should_offload(dev, tp, flags))
- return tc_skip_sw(flags) ? -EINVAL : 0;
+ if (!tc_can_offload(dev, tp)) {
+ if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
+ (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) {
+ f->hw_dev = dev;
+ return tc_skip_sw(f->flags) ? -EINVAL : 0;
+ }
+ dev = f->hw_dev;
+ tc->egress_dev = true;
+ } else {
+ f->hw_dev = dev;
+ }
offload.command = TC_CLSFLOWER_REPLACE;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
offload.dissector = dissector;
offload.mask = mask;
- offload.key = key;
- offload.exts = actions;
+ offload.key = &f->mkey;
+ offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ tc);
- if (tc_skip_sw(flags))
+ if (tc_skip_sw(f->flags))
return err;
-
return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_STATS;
offload.cookie = (unsigned long)f;
offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
+}
+
+static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
+{
+ list_del_rcu(&f->list);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fl_destroy_filter);
+}
+
+static void fl_destroy_sleepable(struct work_struct *work)
+{
+ struct cls_fl_head *head = container_of(work, struct cls_fl_head,
+ work);
+ if (head->mask_assigned)
+ rhashtable_destroy(&head->ht);
+ kfree(head);
+ module_put(THIS_MODULE);
+}
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+static void fl_destroy_rcu(struct rcu_head *rcu)
+{
+ struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu);
+
+ INIT_WORK(&head->work, fl_destroy_sleepable);
+ schedule_work(&head->work);
}
static bool fl_destroy(struct tcf_proto *tp, bool force)
@@ -247,15 +324,12 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
if (!force && !list_empty(&head->filters))
return false;
- list_for_each_entry_safe(f, next, &head->filters, list) {
- fl_hw_destroy_filter(tp, (unsigned long)f);
- list_del_rcu(&f->list);
- call_rcu(&f->rcu, fl_destroy_filter);
- }
- RCU_INIT_POINTER(tp->root, NULL);
- if (head->mask_assigned)
- rhashtable_destroy(&head->ht);
- kfree_rcu(head, rcu);
+ list_for_each_entry_safe(f, next, &head->filters, list)
+ __fl_delete(tp, f);
+
+ __module_get(THIS_MODULE);
+ call_rcu(&head->rcu, fl_destroy_rcu);
+
return true;
}
@@ -293,6 +367,40 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -308,9 +416,62 @@ static void fl_set_key_val(struct nlattr **tb,
memcpy(mask, nla_data(tb[mask_type]), len);
}
+static void fl_set_key_vlan(struct nlattr **tb,
+ struct flow_dissector_key_vlan *key_val,
+ struct flow_dissector_key_vlan *key_mask)
+{
+#define VLAN_PRIORITY_MASK 0x7
+
+ if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+ key_val->vlan_id =
+ nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+ key_mask->vlan_id = VLAN_VID_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+ key_val->vlan_priority =
+ nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+ VLAN_PRIORITY_MASK;
+ key_mask->vlan_priority = VLAN_PRIORITY_MASK;
+ }
+}
+
+static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
+ u32 *dissector_key, u32 *dissector_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (flower_mask & flower_flag_bit) {
+ *dissector_mask |= dissector_flag_bit;
+ if (flower_key & flower_flag_bit)
+ *dissector_key |= dissector_flag_bit;
+ }
+}
+
+static int fl_set_key_flags(struct nlattr **tb,
+ u32 *flags_key, u32 *flags_mask)
+{
+ u32 key, mask;
+
+ /* mask is mandatory for flags */
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ return -EINVAL;
+
+ key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
+ mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+
+ *flags_key = 0;
+ *flags_mask = 0;
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
+ __be16 ethertype;
+ int ret = 0;
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -328,9 +489,20 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
sizeof(key->eth.src));
- fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
- &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
- sizeof(key->basic.n_proto));
+ if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
+ ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
+
+ if (ethertype == htons(ETH_P_8021Q)) {
+ fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
+ fl_set_key_val(tb, &key->basic.n_proto,
+ TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
+ } else {
+ key->basic.n_proto = ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ }
if (key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) {
@@ -341,6 +513,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->control.addr_type = ~0;
fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
&mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
sizeof(key->ipv4.src));
@@ -349,6 +522,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->ipv4.dst));
} else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->control.addr_type = ~0;
fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
&mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
sizeof(key->ipv6.src));
@@ -359,21 +533,95 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (key->basic.ip_proto == IPPROTO_TCP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
sizeof(key->tp.dst));
} else if (key->basic.ip_proto == IPPROTO_UDP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
+ sizeof(key->tp.dst));
+ } else if (key->basic.ip_proto == IPPROTO_SCTP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
sizeof(key->tp.dst));
+ } else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code));
}
- return 0;
+ if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->enc_control.addr_type = ~0;
+ fl_set_key_val(tb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC,
+ &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src));
+ fl_set_key_val(tb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST,
+ &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst));
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->enc_control.addr_type = ~0;
+ fl_set_key_val(tb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC,
+ &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src));
+ fl_set_key_val(tb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst));
+ }
+
+ fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
+ sizeof(key->enc_key_id.keyid));
+
+ fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src));
+
+ fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst));
+
+ if (tb[TCA_FLOWER_KEY_FLAGS])
+ ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+
+ return ret;
}
static bool fl_mask_eq(struct fl_flow_mask *mask1,
@@ -404,12 +652,10 @@ static int fl_init_hashtable(struct cls_fl_head *head,
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
-#define FL_KEY_MEMBER_END_OFFSET(member) \
- (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member))
-#define FL_KEY_IN_RANGE(mask, member) \
- (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \
- FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start)
+#define FL_KEY_IS_MASKED(mask, member) \
+ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
+ 0, FL_KEY_MEMBER_SIZE(member)) \
#define FL_KEY_SET(keys, cnt, id, member) \
do { \
@@ -418,9 +664,9 @@ static int fl_init_hashtable(struct cls_fl_head *head,
cnt++; \
} while(0);
-#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \
+#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \
do { \
- if (FL_KEY_IN_RANGE(mask, member)) \
+ if (FL_KEY_IS_MASKED(mask, member)) \
FL_KEY_SET(keys, cnt, id, member); \
} while(0);
@@ -432,14 +678,30 @@ static void fl_init_dissector(struct cls_fl_head *head,
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
- FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
- FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
- FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ICMP, icmp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_VLAN, vlan);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
+ if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) ||
+ FL_KEY_IS_MASKED(&mask->key, enc_ipv6))
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ enc_control);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
skb_flow_dissector_init(&head->dissector, keys, cnt);
}
@@ -478,10 +740,12 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
struct tcf_exts e;
int err;
- tcf_exts_init(&e, TCA_FLOWER_ACT, 0);
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ err = tcf_exts_init(&e, TCA_FLOWER_ACT, 0);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ goto errout;
if (tb[TCA_FLOWER_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
@@ -550,7 +814,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!fnew)
return -ENOBUFS;
- tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
+ err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
+ if (err < 0)
+ goto errout;
if (!handle) {
handle = fl_grab_new_handle(tp, head);
@@ -585,20 +851,21 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
- if (err)
- goto errout;
+ if (!tc_skip_hw(fnew->flags)) {
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ fnew);
+ if (err)
+ goto errout;
+ }
if (fold) {
- rhashtable_remove_fast(&head->ht, &fold->ht_node,
- head->ht_params);
- fl_hw_destroy_filter(tp, (unsigned long)fold);
+ if (!tc_skip_sw(fold->flags))
+ rhashtable_remove_fast(&head->ht, &fold->ht_node,
+ head->ht_params);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold);
}
*arg = (unsigned long) fnew;
@@ -614,6 +881,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
+ tcf_exts_destroy(&fnew->exts);
kfree(fnew);
return err;
}
@@ -623,12 +891,10 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
- rhashtable_remove_fast(&head->ht, &f->ht_node,
- head->ht_params);
- list_del_rcu(&f->list);
- fl_hw_destroy_filter(tp, (unsigned long)f);
- tcf_unbind_filter(tp, &f->res);
- call_rcu(&f->rcu, fl_destroy_filter);
+ if (!tc_skip_sw(f->flags))
+ rhashtable_remove_fast(&head->ht, &f->ht_node,
+ head->ht_params);
+ __fl_delete(tp, f);
return 0;
}
@@ -668,6 +934,65 @@ static int fl_dump_key_val(struct sk_buff *skb,
return 0;
}
+static int fl_dump_key_vlan(struct sk_buff *skb,
+ struct flow_dissector_key_vlan *vlan_key,
+ struct flow_dissector_key_vlan *vlan_mask)
+{
+ int err;
+
+ if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
+ return 0;
+ if (vlan_mask->vlan_id) {
+ err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
+ vlan_key->vlan_id);
+ if (err)
+ return err;
+ }
+ if (vlan_mask->vlan_priority) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
+ vlan_key->vlan_priority);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
+ u32 *flower_key, u32 *flower_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (dissector_mask & dissector_flag_bit) {
+ *flower_mask |= flower_flag_bit;
+ if (dissector_key & dissector_flag_bit)
+ *flower_key |= flower_flag_bit;
+ }
+}
+
+static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
+{
+ u32 key, mask;
+ __be32 _key, _mask;
+ int err;
+
+ if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
+ return 0;
+
+ key = 0;
+ mask = 0;
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+ _key = cpu_to_be32(key);
+ _mask = cpu_to_be32(mask);
+
+ err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
+ if (err)
+ return err;
+
+ return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
+}
+
static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -700,7 +1025,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
}
- fl_hw_update_stats(tp, f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -712,6 +1038,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
&mask->basic.n_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.n_proto)))
goto nla_put_failure;
+
+ if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
+ goto nla_put_failure;
+
if ((key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) &&
fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
@@ -738,20 +1068,90 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
if (key->basic.ip_proto == IPPROTO_TCP &&
(fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
sizeof(key->tp.src)) ||
fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
else if (key->basic.ip_proto == IPPROTO_UDP &&
(fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
sizeof(key->tp.src)) ||
fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_SCTP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6 &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+
+ if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst))))
+ goto nla_put_failure;
+ else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst))))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id, TCA_FLOWER_UNSPEC,
+ sizeof(key->enc_key_id)) ||
+ fl_dump_key_val(skb, &key->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src)) ||
+ fl_dump_key_val(skb, &key->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
+ goto nla_put_failure;
nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index f23a3b68bba6..9dc63d54e167 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle)
}
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+ struct tcf_result *res)
{
struct fw_head *head = rcu_dereference_bh(tp->root);
struct fw_filter *f;
@@ -188,17 +188,20 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
static int
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
- struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
+ struct nlattr **tb, struct nlattr **tca, unsigned long base,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct tcf_exts e;
u32 mask;
int err;
- tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ err = tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ goto errout;
if (tb[TCA_FW_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
@@ -235,9 +238,8 @@ errout:
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ u32 handle, struct nlattr **tca, unsigned long *arg,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *) *arg;
@@ -270,10 +272,15 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
- tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ if (err < 0) {
+ kfree(fnew);
+ return err;
+ }
err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr);
if (err < 0) {
+ tcf_exts_destroy(&fnew->exts);
kfree(fnew);
return err;
}
@@ -313,7 +320,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
if (f == NULL)
return -ENOBUFS;
- tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ if (err < 0)
+ goto errout;
f->id = handle;
f->tp = tp;
@@ -328,6 +337,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
+ tcf_exts_destroy(&f->exts);
kfree(f);
return err;
}
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 25927b6c4436..b12bc2abea93 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -16,16 +16,11 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
-struct cls_mall_filter {
+struct cls_mall_head {
struct tcf_exts exts;
struct tcf_result res;
u32 handle;
- struct rcu_head rcu;
u32 flags;
-};
-
-struct cls_mall_head {
- struct cls_mall_filter *filter;
struct rcu_head rcu;
};
@@ -33,38 +28,29 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_mall_head *head = rcu_dereference_bh(tp->root);
- struct cls_mall_filter *f = head->filter;
- if (tc_skip_sw(f->flags))
+ if (tc_skip_sw(head->flags))
return -1;
- return tcf_exts_exec(skb, &f->exts, res);
+ return tcf_exts_exec(skb, &head->exts, res);
}
static int mall_init(struct tcf_proto *tp)
{
- struct cls_mall_head *head;
-
- head = kzalloc(sizeof(*head), GFP_KERNEL);
- if (!head)
- return -ENOBUFS;
-
- rcu_assign_pointer(tp->root, head);
-
return 0;
}
-static void mall_destroy_filter(struct rcu_head *head)
+static void mall_destroy_rcu(struct rcu_head *rcu)
{
- struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu);
+ struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
+ rcu);
- tcf_exts_destroy(&f->exts);
-
- kfree(f);
+ tcf_exts_destroy(&head->exts);
+ kfree(head);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long cookie)
{
struct net_device *dev = tp->q->dev_queue->dev;
@@ -74,7 +60,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
offload.type = TC_SETUP_MATCHALL;
offload.cls_mall = &mall_offload;
offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
- offload.cls_mall->exts = &f->exts;
+ offload.cls_mall->exts = &head->exts;
offload.cls_mall->cookie = cookie;
return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
@@ -82,7 +68,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
}
static void mall_destroy_hw_filter(struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long cookie)
{
struct net_device *dev = tp->q->dev_queue->dev;
@@ -103,30 +89,20 @@ static bool mall_destroy(struct tcf_proto *tp, bool force)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
struct net_device *dev = tp->q->dev_queue->dev;
- struct cls_mall_filter *f = head->filter;
- if (!force && f)
- return false;
+ if (!head)
+ return true;
- if (f) {
- if (tc_should_offload(dev, tp, f->flags))
- mall_destroy_hw_filter(tp, f, (unsigned long) f);
+ if (tc_should_offload(dev, tp, head->flags))
+ mall_destroy_hw_filter(tp, head, (unsigned long) head);
- call_rcu(&f->rcu, mall_destroy_filter);
- }
- RCU_INIT_POINTER(tp->root, NULL);
- kfree_rcu(head, rcu);
+ call_rcu(&head->rcu, mall_destroy_rcu);
return true;
}
static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
{
- struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = head->filter;
-
- if (f && f->handle == handle)
- return (unsigned long) f;
- return 0;
+ return 0UL;
}
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
@@ -135,7 +111,7 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
};
static int mall_set_parms(struct net *net, struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
@@ -148,11 +124,11 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp,
return err;
if (tb[TCA_MATCHALL_CLASSID]) {
- f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
- tcf_bind_filter(tp, &f->res, base);
+ head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
+ tcf_bind_filter(tp, &head->res, base);
}
- tcf_exts_change(tp, &f->exts, &e);
+ tcf_exts_change(tp, &head->exts, &e);
return 0;
}
@@ -163,21 +139,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
unsigned long *arg, bool ovr)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
struct net_device *dev = tp->q->dev_queue->dev;
- struct cls_mall_filter *f;
struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+ struct cls_mall_head *new;
u32 flags = 0;
int err;
if (!tca[TCA_OPTIONS])
return -EINVAL;
- if (head->filter)
- return -EBUSY;
-
- if (fold)
- return -EINVAL;
+ if (head)
+ return -EEXIST;
err = nla_parse_nested(tb, TCA_MATCHALL_MAX,
tca[TCA_OPTIONS], mall_policy);
@@ -190,23 +162,23 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
return -ENOBUFS;
- tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0);
+ tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
if (!handle)
handle = 1;
- f->handle = handle;
- f->flags = flags;
+ new->handle = handle;
+ new->flags = flags;
- err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
+ err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
if (err)
goto errout;
if (tc_should_offload(dev, tp, flags)) {
- err = mall_replace_hw_filter(tp, f, (unsigned long) f);
+ err = mall_replace_hw_filter(tp, new, (unsigned long) new);
if (err) {
if (tc_skip_sw(flags))
goto errout;
@@ -215,39 +187,29 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
}
}
- *arg = (unsigned long) f;
- rcu_assign_pointer(head->filter, f);
-
+ *arg = (unsigned long) head;
+ rcu_assign_pointer(tp->root, new);
+ if (head)
+ call_rcu(&head->rcu, mall_destroy_rcu);
return 0;
errout:
- kfree(f);
+ kfree(new);
return err;
}
static int mall_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
- struct net_device *dev = tp->q->dev_queue->dev;
-
- if (tc_should_offload(dev, tp, f->flags))
- mall_destroy_hw_filter(tp, f, (unsigned long) f);
-
- RCU_INIT_POINTER(head->filter, NULL);
- tcf_unbind_filter(tp, &f->res);
- call_rcu(&f->rcu, mall_destroy_filter);
- return 0;
+ return -EOPNOTSUPP;
}
static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = head->filter;
if (arg->count < arg->skip)
goto skip;
- if (arg->fn(tp, (unsigned long) f, arg) < 0)
+ if (arg->fn(tp, (unsigned long) head, arg) < 0)
arg->stop = 1;
skip:
arg->count++;
@@ -256,28 +218,28 @@ skip:
static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct cls_mall_filter *f = (struct cls_mall_filter *) fh;
+ struct cls_mall_head *head = (struct cls_mall_head *) fh;
struct nlattr *nest;
- if (!f)
+ if (!head)
return skb->len;
- t->tcm_handle = f->handle;
+ t->tcm_handle = head->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
- if (f->res.classid &&
- nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid))
+ if (head->res.classid &&
+ nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts))
+ if (tcf_exts_dump(skb, &head->exts))
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 08a3b0a6f5ab..455fc8f83d0a 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static void
-route4_delete_filter(struct rcu_head *head)
+static void route4_delete_filter(struct rcu_head *head)
{
struct route4_filter *f = container_of(head, struct route4_filter, rcu);
@@ -383,17 +382,19 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
struct nlattr **tb, struct nlattr *est, int new,
bool ovr)
{
- int err;
u32 id = 0, to = 0, nhandle = 0x8000;
struct route4_filter *fp;
unsigned int h1;
struct route4_bucket *b;
struct tcf_exts e;
+ int err;
- tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ err = tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ goto errout;
err = -EINVAL;
if (tb[TCA_ROUTE4_TO]) {
@@ -472,10 +473,8 @@ errout:
}
static int route4_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
@@ -503,7 +502,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
if (!f)
goto errout;
- tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ if (err < 0)
+ goto errout;
+
if (fold) {
f->id = fold->id;
f->iif = fold->iif;
@@ -557,6 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
+ if (f)
+ tcf_exts_destroy(&f->exts);
kfree(f);
return err;
}
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index f9c9fc075fe6..322438fb3ffc 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -152,7 +152,8 @@ static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
return -1;
nhptr = ip_hdr(skb);
#endif
-
+ if (unlikely(!head))
+ return -1;
restart:
#if RSVP_DST_LEN == 4
@@ -487,10 +488,12 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ goto errout2;
f = (struct rsvp_filter *)*arg;
if (f) {
@@ -506,7 +509,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
goto errout2;
}
- tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ if (err < 0) {
+ kfree(n);
+ goto errout2;
+ }
if (tb[TCA_RSVP_CLASSID]) {
n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
@@ -530,7 +537,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
if (f == NULL)
goto errout2;
- tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ if (err < 0)
+ goto errout;
h2 = 16;
if (tb[TCA_RSVP_SRC]) {
memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
@@ -627,6 +636,7 @@ insert:
goto insert;
errout:
+ tcf_exts_destroy(&f->exts);
kfree(f);
errout2:
tcf_exts_destroy(&e);
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 944c8ff45055..0751245a6ace 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -50,14 +50,13 @@ struct tcindex_data {
struct rcu_head rcu;
};
-static inline int
-tcindex_filter_is_set(struct tcindex_filter_result *r)
+static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
{
return tcf_exts_is_predicative(&r->exts) || r->res.classid;
}
-static struct tcindex_filter_result *
-tcindex_lookup(struct tcindex_data *p, u16 key)
+static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
+ u16 key)
{
if (p->perfect) {
struct tcindex_filter_result *f = p->perfect + key;
@@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head)
static void tcindex_destroy_fexts(struct rcu_head *head)
{
- struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu);
+ struct tcindex_filter *f = container_of(head, struct tcindex_filter,
+ rcu);
tcf_exts_destroy(&f->result.exts);
kfree(f);
@@ -219,10 +219,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
-static void tcindex_filter_result_init(struct tcindex_filter_result *r)
+static int tcindex_filter_result_init(struct tcindex_filter_result *r)
{
memset(r, 0, sizeof(*r));
- tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
}
static void __tcindex_partial_destroy(struct rcu_head *head)
@@ -233,23 +233,57 @@ static void __tcindex_partial_destroy(struct rcu_head *head)
kfree(p);
}
+static void tcindex_free_perfect_hash(struct tcindex_data *cp)
+{
+ int i;
+
+ for (i = 0; i < cp->hash; i++)
+ tcf_exts_destroy(&cp->perfect[i].exts);
+ kfree(cp->perfect);
+}
+
+static int tcindex_alloc_perfect_hash(struct tcindex_data *cp)
+{
+ int i, err = 0;
+
+ cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
+ GFP_KERNEL);
+ if (!cp->perfect)
+ return -ENOMEM;
+
+ for (i = 0; i < cp->hash; i++) {
+ err = tcf_exts_init(&cp->perfect[i].exts,
+ TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ if (err < 0)
+ goto errout;
+ }
+
+ return 0;
+
+errout:
+ tcindex_free_perfect_hash(cp);
+ return err;
+}
+
static int
tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
u32 handle, struct tcindex_data *p,
struct tcindex_filter_result *r, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
- int err, balloc = 0;
struct tcindex_filter_result new_filter_result, *old_r = r;
struct tcindex_filter_result cr;
- struct tcindex_data *cp, *oldp;
+ struct tcindex_data *cp = NULL, *oldp;
struct tcindex_filter *f = NULL; /* make gcc behave */
+ int err, balloc = 0;
struct tcf_exts e;
- tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ goto errout;
err = -ENOMEM;
/* tcindex_data attributes must look atomic to classifier/lookup so
@@ -270,19 +304,20 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (p->perfect) {
int i;
- cp->perfect = kmemdup(p->perfect,
- sizeof(*r) * cp->hash, GFP_KERNEL);
- if (!cp->perfect)
+ if (tcindex_alloc_perfect_hash(cp) < 0)
goto errout;
for (i = 0; i < cp->hash; i++)
- tcf_exts_init(&cp->perfect[i].exts,
- TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ cp->perfect[i].res = p->perfect[i].res;
balloc = 1;
}
cp->h = p->h;
- tcindex_filter_result_init(&new_filter_result);
- tcindex_filter_result_init(&cr);
+ err = tcindex_filter_result_init(&new_filter_result);
+ if (err < 0)
+ goto errout1;
+ err = tcindex_filter_result_init(&cr);
+ if (err < 0)
+ goto errout1;
if (old_r)
cr.res = r->res;
@@ -338,15 +373,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
err = -ENOMEM;
if (!cp->perfect && !cp->h) {
if (valid_perfect_hash(cp)) {
- int i;
-
- cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
- if (!cp->perfect)
+ if (tcindex_alloc_perfect_hash(cp) < 0)
goto errout_alloc;
- for (i = 0; i < cp->hash; i++)
- tcf_exts_init(&cp->perfect[i].exts,
- TCA_TCINDEX_ACT,
- TCA_TCINDEX_POLICE);
balloc = 1;
} else {
struct tcindex_filter __rcu **hash;
@@ -373,8 +401,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (!f)
goto errout_alloc;
f->key = handle;
- tcindex_filter_result_init(&f->result);
f->next = NULL;
+ err = tcindex_filter_result_init(&f->result);
+ if (err < 0) {
+ kfree(f);
+ goto errout_alloc;
+ }
}
if (tb[TCA_TCINDEX_CLASSID]) {
@@ -387,8 +419,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
else
tcf_exts_change(tp, &cr.exts, &e);
- if (old_r && old_r != r)
- tcindex_filter_result_init(old_r);
+ if (old_r && old_r != r) {
+ err = tcindex_filter_result_init(old_r);
+ if (err < 0) {
+ kfree(f);
+ goto errout_alloc;
+ }
+ }
oldp = p;
r->res = cr.res;
@@ -415,9 +452,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
errout_alloc:
if (balloc == 1)
- kfree(cp->perfect);
+ tcindex_free_perfect_hash(cp);
else if (balloc == 2)
kfree(cp->h);
+errout1:
+ tcf_exts_destroy(&cr.exts);
+ tcf_exts_destroy(&new_filter_result.exts);
errout:
kfree(cp);
tcf_exts_destroy(&e);
@@ -503,14 +543,13 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force)
walker.fn = tcindex_destroy_element;
tcindex_walk(tp, &walker);
- RCU_INIT_POINTER(tp->root, NULL);
call_rcu(&p->rcu, __tcindex_destroy);
return true;
}
static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ffe593efe930..ae83c3aec308 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key,
return h;
}
-static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
@@ -256,8 +257,7 @@ deadloop:
return -1;
}
-static struct tc_u_hnode *
-u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static struct tc_u_knode *
-u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
unsigned int sel;
struct tc_u_knode *n = NULL;
@@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp)
return 0;
}
-static int u32_destroy_key(struct tcf_proto *tp,
- struct tc_u_knode *n,
+static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
bool free_pf)
{
tcf_exts_destroy(&n->exts);
@@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
}
}
-static int u32_replace_hw_hnode(struct tcf_proto *tp,
- struct tc_u_hnode *h,
- u32 flags)
+static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
}
}
-static int u32_replace_hw_knode(struct tcf_proto *tp,
- struct tc_u_knode *n,
- u32 flags)
+static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -709,13 +705,15 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
struct tc_u_knode *n, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
- int err;
struct tcf_exts e;
+ int err;
- tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
if (err < 0)
return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ goto errout;
err = -EINVAL;
if (tb[TCA_U32_LINK]) {
@@ -761,8 +759,7 @@ errout:
return err;
}
-static void u32_replace_knode(struct tcf_proto *tp,
- struct tc_u_common *tp_c,
+static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
struct tc_u_knode *n)
{
struct tc_u_knode __rcu **ins;
@@ -833,15 +830,17 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
new->tp = tp;
memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
- tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
+ kfree(new);
+ return NULL;
+ }
return new;
}
static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -985,9 +984,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
n->flags = flags;
- tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
n->tp = tp;
+ err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ if (err < 0)
+ goto errout;
+
#ifdef CONFIG_CLS_U32_MARK
n->pcpu_success = alloc_percpu(u32);
if (!n->pcpu_success) {
@@ -1028,9 +1030,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
errhw:
#ifdef CONFIG_CLS_U32_MARK
free_percpu(n->pcpu_success);
-errout:
#endif
+errout:
+ tcf_exts_destroy(&n->exts);
#ifdef CONFIG_CLS_U32_PERF
free_percpu(n->pf);
#endif
@@ -1079,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct tc_u_hnode *ht_up, *ht_down;
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c66ca9400ab4..c1b23e3060b8 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -57,17 +57,20 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
struct xt_action_param acpar;
const struct xt_set_info *set = (const void *) em->data;
struct net_device *dev, *indev = NULL;
+ struct nf_hook_state state = {
+ .net = em->net,
+ };
int ret, network_offset;
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
- acpar.family = NFPROTO_IPV4;
+ state.pf = NFPROTO_IPV4;
if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
return 0;
acpar.thoff = ip_hdrlen(skb);
break;
case htons(ETH_P_IPV6):
- acpar.family = NFPROTO_IPV6;
+ state.pf = NFPROTO_IPV6;
if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
return 0;
/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
@@ -77,9 +80,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
return 0;
}
- acpar.hooknum = 0;
-
- opt.family = acpar.family;
+ opt.family = state.pf;
opt.dim = set->dim;
opt.flags = set->flags;
opt.cmdflags = 0;
@@ -95,9 +96,9 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- acpar.net = em->net;
- acpar.in = indev ? indev : dev;
- acpar.out = dev;
+ state.in = indev ? indev : dev;
+ state.out = dev;
+ acpar.state = &state;
ret = ip_set_test(set->index, skb, &acpar, &opt);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a309a07ccb35..41c80b6c3906 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -176,11 +176,12 @@ META_COLLECTOR(int_vlan_tag)
{
unsigned short tag;
- tag = skb_vlan_tag_get(skb);
- if (!tag && __vlan_get_tag(skb, &tag))
- *err = -1;
- else
+ if (skb_vlan_tag_present(skb))
+ dst->value = skb_vlan_tag_get(skb);
+ else if (!__vlan_get_tag(skb, &tag))
dst->value = tag;
+ else
+ *err = -1;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 12ebde845523..d7b93429f0cc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -29,6 +29,7 @@
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
+#include <linux/hashtable.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -259,37 +260,40 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
struct Qdisc *q;
+ if (!qdisc_dev(root))
+ return (root->handle == handle ? root : NULL);
+
if (!(root->flags & TCQ_F_BUILTIN) &&
root->handle == handle)
return root;
- list_for_each_entry_rcu(q, &root->list, list) {
+ hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
if (q->handle == handle)
return q;
}
return NULL;
}
-void qdisc_list_add(struct Qdisc *q)
+void qdisc_hash_add(struct Qdisc *q)
{
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
struct Qdisc *root = qdisc_dev(q)->qdisc;
WARN_ON_ONCE(root == &noop_qdisc);
ASSERT_RTNL();
- list_add_tail_rcu(&q->list, &root->list);
+ hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
}
}
-EXPORT_SYMBOL(qdisc_list_add);
+EXPORT_SYMBOL(qdisc_hash_add);
-void qdisc_list_del(struct Qdisc *q)
+void qdisc_hash_del(struct Qdisc *q)
{
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
ASSERT_RTNL();
- list_del_rcu(&q->list);
+ hash_del_rcu(&q->hash);
}
}
-EXPORT_SYMBOL(qdisc_list_del);
+EXPORT_SYMBOL(qdisc_hash_del);
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
@@ -385,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
static struct qdisc_rate_table *qdisc_rtab_list;
-struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
+ struct nlattr *tab)
{
struct qdisc_rate_table *rtab;
@@ -537,7 +542,8 @@ nla_put_failure:
return -1;
}
-void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+ const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -884,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock;
Parameters are passed via opt.
*/
-static struct Qdisc *
-qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
- struct Qdisc *p, u32 parent, u32 handle,
- struct nlattr **tca, int *errp)
+static struct Qdisc *qdisc_create(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *p, u32 parent, u32 handle,
+ struct nlattr **tca, int *errp)
{
int err;
struct nlattr *kind = tca[TCA_KIND];
@@ -954,6 +960,17 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
sch->handle = handle;
+ /* This exist to keep backward compatible with a userspace
+ * loophole, what allowed userspace to get IFF_NO_QUEUE
+ * facility on older kernels by setting tx_queue_len=0 (prior
+ * to qdisc init), and then forgot to reinit tx_queue_len
+ * before again attaching a qdisc.
+ */
+ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
+ }
+
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
if (qdisc_is_percpu_stats(sch)) {
sch->cpu_bstats =
@@ -998,7 +1015,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
goto err_out4;
}
- qdisc_list_add(sch);
+ qdisc_hash_add(sch);
return sch;
}
@@ -1069,7 +1086,8 @@ struct check_loop_arg {
int depth;
};
-static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+static int check_loop_fn(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w);
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
@@ -1377,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
@@ -1431,10 +1449,11 @@ err_out:
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
struct netlink_callback *cb,
- int *q_idx_p, int s_q_idx)
+ int *q_idx_p, int s_q_idx, bool recur)
{
int ret = 0, q_idx = *q_idx_p;
struct Qdisc *q;
+ int b;
if (!root)
return 0;
@@ -1445,18 +1464,30 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
} else {
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
- list_for_each_entry(q, &root->list, list) {
+
+ /* If dumping singletons, there is no qdisc_dev(root) and the singleton
+ * itself has already been dumped.
+ *
+ * If we've already dumped the top-level (ingress) qdisc above and the global
+ * qdisc hashtable, we don't want to hit it again
+ */
+ if (!qdisc_dev(root) || !recur)
+ goto out;
+
+ hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
if (q_idx < s_q_idx) {
q_idx++;
continue;
}
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
@@ -1490,13 +1521,14 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_q_idx = 0;
q_idx = 0;
- if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
+ if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
+ true) < 0)
goto done;
dev_queue = dev_ingress_queue(dev);
if (dev_queue &&
tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
- &q_idx, s_q_idx) < 0)
+ &q_idx, s_q_idx, false) < 0)
goto done;
cont:
@@ -1625,7 +1657,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
if (cops->delete)
err = cops->delete(q, cl);
if (err == 0)
- tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
+ tclass_notify(net, skb, n, q, cl,
+ RTM_DELTCLASS);
goto out;
case RTM_GETTCLASS:
err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
@@ -1723,12 +1756,14 @@ struct qdisc_dump_args {
struct netlink_callback *cb;
};
-static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *arg)
{
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTCLASS);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
@@ -1765,6 +1800,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
int *t_p, int s_t)
{
struct Qdisc *q;
+ int b;
if (!root)
return 0;
@@ -1772,7 +1808,10 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
return -1;
- list_for_each_entry(q, &root->list, list) {
+ if (!qdisc_dev(root))
+ return 0;
+
+ hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
return -1;
}
@@ -1957,10 +1996,12 @@ static int __init pktsched_init(void)
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
+ NULL);
rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
+ NULL);
return 0;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfb..f1207582cbf3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
@@ -509,7 +509,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
if (delay) {
ktime_t time;
- time = ktime_set(0, 0);
+ time = 0;
time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
}
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 4002df3c7d9f..5bfa79ee657c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -69,7 +69,7 @@ struct codel_sched_data {
static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
struct Qdisc *sch = ctx;
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
if (skb)
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef84..bb4cbdf75004 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index baeed6a78d28..1e37247656f8 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
unsigned int prev_backlog;
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
prev_backlog = sch->qstats.backlog;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b99e09c..a4f738ac7728 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
@@ -94,6 +95,7 @@ struct fq_sched_data {
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
u32 orphan_mask; /* mask for orphaned skb */
+ u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
@@ -134,7 +136,7 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
struct fq_flow *aux;
parent = *p;
- aux = container_of(parent, struct fq_flow, rate_node);
+ aux = rb_entry(parent, struct fq_flow, rate_node);
if (f->time_next_packet >= aux->time_next_packet)
p = &parent->rb_right;
else
@@ -186,7 +188,7 @@ static void fq_gc(struct fq_sched_data *q,
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk)
break;
@@ -243,7 +245,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
skb_orphan(skb);
}
- root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+ root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
if (q->flows >= (2U << q->fq_trees_log) &&
q->inactive_flows > q->flows/2)
@@ -254,7 +256,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk) {
/* socket might have been reallocated, so check
* if its sk_hash is the same.
@@ -407,14 +409,22 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
+ unsigned long sample;
struct rb_node *p;
if (q->time_next_delayed_flow > now)
return;
+ /* Update unthrottle latency EWMA.
+ * This is cheap and can help diagnosing timer/latency problems.
+ */
+ sample = (unsigned long)(now - q->time_next_delayed_flow);
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
- struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+ struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
if (f->time_next_packet > now) {
q->time_next_delayed_flow = f->time_next_packet;
@@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate;
+ u32 rate, plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
@@ -482,7 +492,7 @@ begin:
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
- if (f->credit > 0 || !q->rate_enable)
+ if (!q->rate_enable)
goto out;
/* Do not pace locally generated ack packets */
@@ -493,8 +503,15 @@ begin:
if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate);
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ plen = qdisc_pkt_len(skb);
+ } else {
+ plen = max(qdisc_pkt_len(skb), q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
if (rate != ~0U) {
- u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
@@ -507,7 +524,12 @@ begin:
len = NSEC_PER_SEC;
q->stat_pkts_too_long++;
}
-
+ /* Account for schedule/timers drifts.
+ * f->time_next_packet was set when prior packet was sent,
+ * and current time (@now) can be too late by tens of us.
+ */
+ if (f->time_next_packet)
+ len -= min(len/2, now - f->time_next_packet);
f->time_next_packet = now + len;
}
out:
@@ -541,7 +563,7 @@ static void fq_reset(struct Qdisc *sch)
for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
root = &q->fq_root[idx];
while ((p = rb_first(root)) != NULL) {
- f = container_of(p, struct fq_flow, fq_node);
+ f = rb_entry(p, struct fq_flow, fq_node);
rb_erase(p, root);
fq_flow_purge(f);
@@ -571,20 +593,20 @@ static void fq_rehash(struct fq_sched_data *q,
oroot = &old_array[idx];
while ((op = rb_first(oroot)) != NULL) {
rb_erase(op, oroot);
- of = container_of(op, struct fq_flow, fq_node);
+ of = rb_entry(op, struct fq_flow, fq_node);
if (fq_gc_candidate(of)) {
fcnt++;
kmem_cache_free(fq_flow_cachep, of);
continue;
}
- nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+ nroot = &new_array[hash_ptr(of->sk, new_log)];
np = &nroot->rb_node;
parent = NULL;
while (*np) {
parent = *np;
- nf = container_of(parent, struct fq_flow, fq_node);
+ nf = rb_entry(parent, struct fq_flow, fq_node);
BUG_ON(nf->sk == of->sk);
if (nf->sk > of->sk)
@@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+ q->low_rate_threshold =
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
@@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
+ q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
q->old_flows.first = NULL;
@@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
+ q->low_rate_threshold = 550000 / 8;
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
@@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+ q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -823,20 +854,24 @@ nla_put_failure:
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 now = ktime_get_ns();
- struct tc_fq_qd_stats st = {
- .gc_flows = q->stat_gc_flows,
- .highprio_packets = q->stat_internal_packets,
- .tcp_retrans = q->stat_tcp_retrans,
- .throttled = q->stat_throttled,
- .flows_plimit = q->stat_flows_plimit,
- .pkts_too_long = q->stat_pkts_too_long,
- .allocation_errors = q->stat_allocation_errors,
- .flows = q->flows,
- .inactive_flows = q->inactive_flows,
- .throttled_flows = q->throttled_flows,
- .time_next_delayed_flow = q->time_next_delayed_flow - now,
- };
+ struct tc_fq_qd_stats st;
+
+ sch_tree_lock(sch);
+
+ st.gc_flows = q->stat_gc_flows;
+ st.highprio_packets = q->stat_internal_packets;
+ st.tcp_retrans = q->stat_tcp_retrans;
+ st.throttled = q->stat_throttled;
+ st.flows_plimit = q->stat_flows_plimit;
+ st.pkts_too_long = q->stat_pkts_too_long;
+ st.allocation_errors = q->stat_allocation_errors;
+ st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.flows = q->flows;
+ st.inactive_flows = q->inactive_flows;
+ st.throttled_flows = q->throttled_flows;
+ st.unthrottle_latency_ns = min_t(unsigned long,
+ q->unthrottle_latency_ns, ~0U);
+ sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 657c13362b19..6eb9c8e88519 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -423,7 +423,6 @@ struct Qdisc noop_qdisc = {
.dequeue = noop_dequeue,
.flags = TCQ_F_BUILTIN,
.ops = &noop_qdisc_ops,
- .list = LIST_HEAD_INIT(noop_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
.running = SEQCNT_ZERO(noop_qdisc.running),
@@ -467,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
*/
struct pfifo_fast_priv {
u32 bitmap;
- struct sk_buff_head q[PFIFO_FAST_BANDS];
+ struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};
/*
@@ -478,7 +477,7 @@ struct pfifo_fast_priv {
*/
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
int band)
{
return priv->q + band;
@@ -487,10 +486,10 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+ if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *list = band2list(priv, band);
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
@@ -506,11 +505,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (likely(band >= 0)) {
- struct sk_buff_head *list = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+ struct qdisc_skb_head *qh = band2list(priv, band);
+ struct sk_buff *skb = __qdisc_dequeue_head(qh);
+
+ if (likely(skb != NULL)) {
+ qdisc_qstats_backlog_dec(qdisc, skb);
+ qdisc_bstats_update(qdisc, skb);
+ }
qdisc->q.qlen--;
- if (skb_queue_empty(list))
+ if (qh->qlen == 0)
priv->bitmap &= ~(1 << band);
return skb;
@@ -525,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (band >= 0) {
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *qh = band2list(priv, band);
- return skb_peek(list);
+ return qh->head;
}
return NULL;
@@ -565,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __skb_queue_head_init(band2list(priv, prio));
+ qdisc_skb_head_init(band2list(priv, prio));
/* Can by-pass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
@@ -613,8 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
- INIT_LIST_HEAD(&sch->list);
- skb_queue_head_init(&sch->q);
+ qdisc_skb_head_init(&sch->q);
+ spin_lock_init(&sch->q.lock);
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
@@ -701,11 +705,11 @@ void qdisc_destroy(struct Qdisc *qdisc)
return;
#ifdef CONFIG_NET_SCHED
- qdisc_list_del(qdisc);
+ qdisc_hash_del(qdisc);
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
@@ -789,6 +793,10 @@ static void attach_default_qdiscs(struct net_device *dev)
qdisc->ops->attach(qdisc);
}
}
+#ifdef CONFIG_NET_SCHED
+ if (dev->qdisc)
+ qdisc_hash_add(dev->qdisc);
+#endif
}
static void transition_one_qdisc(struct net_device *dev,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3ddc7bd74ecb..3ffaa6fb0990 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
@@ -142,8 +142,6 @@ struct hfsc_class {
link-sharing, max(myf, cfmin) */
u64 cl_myf; /* my fit-time (calculated from this
class's own upperlimit curve) */
- u64 cl_myfadj; /* my fit-time adjustment (to cancel
- history dependence) */
u64 cl_cfmin; /* earliest children's fit-time (used
with cl_myf to obtain cl_f) */
u64 cl_cvtmin; /* minimal virtual time among the
@@ -151,11 +149,8 @@ struct hfsc_class {
(monotonic within a period) */
u64 cl_vtadj; /* intra-period cumulative vt
adjustment */
- u64 cl_vtoff; /* inter-period cumulative vt offset */
- u64 cl_cvtmax; /* max child's vt in the last period */
- u64 cl_cvtoff; /* cumulative cvtmax of all periods */
- u64 cl_pcvtoff; /* parent's cvtoff at initialization
- time */
+ u64 cl_cvtoff; /* largest virtual time seen among
+ the children */
struct internal_sc cl_rsc; /* internal real-time service curve */
struct internal_sc cl_fsc; /* internal fair service curve */
@@ -701,28 +696,16 @@ init_vf(struct hfsc_class *cl, unsigned int len)
} else {
/*
* first child for a new parent backlog period.
- * add parent's cvtmax to cvtoff to make a new
- * vt (vtoff + vt) larger than the vt in the
- * last period for all children.
+ * initialize cl_vt to the highest value seen
+ * among the siblings. this is analogous to
+ * what cur_time would provide in realtime case.
*/
- vt = cl->cl_parent->cl_cvtmax;
- cl->cl_parent->cl_cvtoff += vt;
- cl->cl_parent->cl_cvtmax = 0;
+ cl->cl_vt = cl->cl_parent->cl_cvtoff;
cl->cl_parent->cl_cvtmin = 0;
- cl->cl_vt = 0;
}
- cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
- cl->cl_pcvtoff;
-
/* update the virtual curve */
- vt = cl->cl_vt + cl->cl_vtoff;
- rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
- cl->cl_total);
- if (cl->cl_virtual.x == vt) {
- cl->cl_virtual.x -= cl->cl_vtoff;
- cl->cl_vtoff = 0;
- }
+ rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
cl->cl_vtadj = 0;
cl->cl_vtperiod++; /* increment vt period */
@@ -745,7 +728,6 @@ init_vf(struct hfsc_class *cl, unsigned int len)
/* compute myf */
cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
- cl->cl_myfadj = 0;
}
}
@@ -779,8 +761,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
go_passive = 0;
/* update vt */
- cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
- - cl->cl_vtoff + cl->cl_vtadj;
+ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj;
/*
* if vt of the class is smaller than cvtmin,
@@ -795,9 +776,9 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
if (go_passive) {
/* no more active child, going passive */
- /* update cvtmax of the parent class */
- if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
- cl->cl_parent->cl_cvtmax = cl->cl_vt;
+ /* update cvtoff of the parent class */
+ if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
+ cl->cl_parent->cl_cvtoff = cl->cl_vt;
/* remove this class from the vt tree */
vttree_remove(cl);
@@ -813,9 +794,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
/* update f */
if (cl->cl_flags & HFSC_USC) {
+ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
+#if 0
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
-#if 0
/*
* This code causes classes to stay way under their
* limit when multiple classes are used at gigabit
@@ -940,7 +922,7 @@ static void
hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
{
sc2isc(fsc, &cl->cl_fsc);
- rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total);
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
cl->cl_flags |= HFSC_FSC;
}
@@ -1094,7 +1076,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (parent->level == 0)
hfsc_purge_queue(sch, parent);
hfsc_adjust_levels(parent);
- cl->cl_pcvtoff = parent->cl_cvtoff;
sch_tree_unlock(sch);
qdisc_class_hash_grow(sch, &q->clhash);
@@ -1110,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
@@ -1367,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
@@ -1482,16 +1463,12 @@ hfsc_reset_class(struct hfsc_class *cl)
cl->cl_e = 0;
cl->cl_vt = 0;
cl->cl_vtadj = 0;
- cl->cl_vtoff = 0;
cl->cl_cvtmin = 0;
- cl->cl_cvtmax = 0;
cl->cl_cvtoff = 0;
- cl->cl_pcvtoff = 0;
cl->cl_vtperiod = 0;
cl->cl_parentperiod = 0;
cl->cl_f = 0;
cl->cl_myf = 0;
- cl->cl_myfadj = 0;
cl->cl_cfmin = 0;
cl->cl_nactive = 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 53dbfa187870..760f39e7caee 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
@@ -162,7 +162,7 @@ struct htb_sched {
struct work_struct work;
/* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
+ struct qdisc_skb_head direct_queue;
long direct_pkts;
struct qdisc_watchdog watchdog;
@@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
+static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
+ struct qdisc_skb_head *qh)
+{
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
+}
+
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- __skb_queue_tail(&q->direct_queue, skb);
+ htb_enqueue_tail(skb, sch, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
@@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
unsigned long start_at;
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
- skb = __skb_dequeue(&q->direct_queue);
+ skb = __qdisc_dequeue_head(&q->direct_queue);
if (skb != NULL) {
ok:
qdisc_bstats_update(sch, skb);
@@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
- __skb_queue_head_init(&q->direct_queue);
+ qdisc_skb_head_init(&q->direct_queue);
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
@@ -1129,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1212,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index b9439827c172..2bc8d7f8df16 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch)
qdisc_destroy(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
- qdisc_list_add(qdisc);
+ qdisc_hash_add(qdisc);
#endif
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 549c66359924..b5c502c78143 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch)
if (old)
qdisc_destroy(old);
if (ntx < dev->real_num_tx_queues)
- qdisc_list_add(qdisc);
+ qdisc_hash_add(qdisc);
}
kfree(priv->qdiscs);
priv->qdiscs = NULL;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index aaaf02175338..bcfadfdea8e0 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -152,7 +152,7 @@ struct netem_skb_cb {
static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
{
- return container_of(rb, struct sk_buff, rbnode);
+ return rb_entry(rb, struct sk_buff, rbnode);
}
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
@@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
return segs;
}
+static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
+{
+ skb->next = qh->head;
+
+ if (!qh->head)
+ qh->tail = skb;
+ qh->head = skb;
+ qh->qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -502,7 +512,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1<<(prandom_u32() % 8);
}
- if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -522,8 +532,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (q->rate) {
struct sk_buff *last;
- if (!skb_queue_empty(&sch->q))
- last = skb_peek_tail(&sch->q);
+ if (sch->q.qlen)
+ last = sch->q.tail;
else
last = netem_rb_to_skb(rb_last(&q->t_root));
if (last) {
@@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&sch->q, skb);
+ netem_enqueue_skb_head(&sch->q, skb);
sch->qstats.requeues++;
}
@@ -587,7 +597,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct rb_node *p;
tfifo_dequeue:
- skb = __skb_dequeue(&sch->q);
+ skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
deliver:
@@ -617,7 +627,7 @@ deliver:
* from the network (tstamp will be updated).
*/
if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
#endif
if (q->qdisc) {
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index a570b0bb254c..5c3a99d6aa82 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
@@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
- skb = __qdisc_dequeue_head(sch, &sch->q);
+ skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f743..f9e712ce2d15 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ set_change_agg:
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 2cd9b4478b92..b0196366d58d 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -418,9 +418,6 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
struct teql_master *m = netdev_priv(dev);
struct Qdisc *q;
- if (new_mtu < 68)
- return -EINVAL;
-
q = m->slaves;
if (q) {
do {
@@ -460,6 +457,8 @@ static __init void teql_master_setup(struct net_device *dev)
dev->netdev_ops = &teql_netdev_ops;
dev->type = ARPHRD_VOID;
dev->mtu = 1500;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65535;
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1c23060c41a6..d3cc30c25c41 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -700,11 +700,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* Set the peer's active state. */
peer->state = peer_state;
+ /* Add this peer into the transport hashtable */
+ if (sctp_hash_transport(peer)) {
+ sctp_transport_free(peer);
+ return NULL;
+ }
+
/* Attach the remote transport to our asoc. */
list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
asoc->peer.transport_count++;
- /* Add this peer into the transport hashtable */
- sctp_hash_transport(peer);
/* If we do not yet have a primary path, set one. */
if (!asoc->peer.primary_path) {
@@ -1408,7 +1412,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
transports) {
if (t->pmtu_pending && t->dst) {
sctp_transport_update_pmtu(sk, t,
- WORD_TRUNC(dst_mtu(t->dst)));
+ SCTP_TRUNC4(dst_mtu(t->dst)));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
@@ -1467,7 +1471,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
* threshold. The idea is to recover slowly, but up
* to the initial advertised window.
*/
- if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+ if (asoc->rwnd_press) {
int change = min(asoc->pathmtu, asoc->rwnd_press);
asoc->rwnd += change;
asoc->rwnd_press -= change;
@@ -1535,7 +1539,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
asoc->rwnd = 0;
}
} else {
- asoc->rwnd_over = len - asoc->rwnd;
+ asoc->rwnd_over += len - asoc->rwnd;
asoc->rwnd = 0;
}
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 912eb1685a5d..f99d4855d3de 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -48,7 +48,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
/* id 2 is reserved as well */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
},
-#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE)
+#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
.hmac_name = "hmac(sha256)",
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 401c60750b20..1ebc184a0e23 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -292,6 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
}
af->from_addr_param(&addr, rawaddr, htons(port), 0);
+ if (sctp_bind_addr_state(bp, &addr) != -1)
+ goto next;
retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
SCTP_ADDR_SRC, gfp);
if (retval) {
@@ -300,6 +302,7 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
break;
}
+next:
len = ntohs(param->length);
addrs_len -= len;
raw_addr_list += len;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0a3dbec0a8fb..615f0ddd41df 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -52,7 +52,6 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
atomic_set(&msg->refcnt, 1);
msg->send_failed = 0;
msg->send_error = 0;
- msg->can_abandon = 0;
msg->can_delay = 1;
msg->expires_at = 0;
INIT_LIST_HEAD(&msg->chunks);
@@ -70,6 +69,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
return msg;
}
+void sctp_datamsg_free(struct sctp_datamsg *msg)
+{
+ struct sctp_chunk *chunk;
+
+ /* This doesn't have to be a _safe vairant because
+ * sctp_chunk_free() only drops the refs.
+ */
+ list_for_each_entry(chunk, &msg->chunks, frag_list)
+ sctp_chunk_free(chunk);
+
+ sctp_datamsg_put(msg);
+}
+
/* Final destructruction of datamsg memory. */
static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
{
@@ -169,27 +181,19 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* Note: Calculate this outside of the loop, so that all fragments
* have the same expiration.
*/
- if (sinfo->sinfo_timetolive) {
- /* sinfo_timetolive is in milliseconds */
+ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive &&
+ (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
+ !SCTP_PR_POLICY(sinfo->sinfo_flags)))
msg->expires_at = jiffies +
msecs_to_jiffies(sinfo->sinfo_timetolive);
- msg->can_abandon = 1;
-
- pr_debug("%s: msg:%p expires_at:%ld jiffies:%ld\n", __func__,
- msg, msg->expires_at, jiffies);
- }
-
- if (asoc->peer.prsctp_capable &&
- SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
- msg->expires_at =
- jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- max_data = (asoc->pathmtu -
- sctp_sk(asoc->base.sk)->pf->af->net_header_len -
- sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
+ max_data = asoc->pathmtu -
+ sctp_sk(asoc->base.sk)->pf->af->net_header_len -
+ sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+ max_data = SCTP_TRUNC4(max_data);
max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
@@ -200,8 +204,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
if (hmac_desc)
- max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
- hmac_desc->hmac_len);
+ max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) +
+ hmac_desc->hmac_len);
}
/* Now, check if we need to reduce our max */
@@ -221,7 +225,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
msg_len > max)
- max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
+ max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
@@ -340,18 +344,8 @@ errout:
/* Check whether this message has expired. */
int sctp_chunk_abandoned(struct sctp_chunk *chunk)
{
- if (!chunk->asoc->peer.prsctp_capable ||
- !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
- struct sctp_datamsg *msg = chunk->msg;
-
- if (!msg->can_abandon)
- return 0;
-
- if (time_after(jiffies, msg->expires_at))
- return 1;
-
+ if (!chunk->asoc->peer.prsctp_capable)
return 0;
- }
if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
time_after(jiffies, chunk->msg->expires_at)) {
@@ -364,6 +358,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
return 1;
+ } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
+ chunk->msg->expires_at &&
+ time_after(jiffies, chunk->msg->expires_at)) {
+ return 1;
}
/* PRIO policy is processed by sendmsg, not here */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 1f03065686fe..410ddc1e3443 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -331,7 +331,9 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
* on this endpoint.
*/
if (!ep->base.bind_addr.port)
- goto out;
+ return NULL;
+
+ rcu_read_lock();
t = sctp_epaddr_lookup_transport(ep, paddr);
if (!t)
goto out;
@@ -339,6 +341,7 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
*transport = t;
asoc = t->asoc;
out:
+ rcu_read_unlock();
return asoc;
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1555fb8c68e0..458e506ef84b 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -181,9 +181,10 @@ int sctp_rcv(struct sk_buff *skb)
* bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
*/
if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) {
- if (asoc) {
- sctp_association_put(asoc);
+ if (transport) {
+ sctp_transport_put(transport);
asoc = NULL;
+ transport = NULL;
} else {
sctp_endpoint_put(ep);
ep = NULL;
@@ -269,8 +270,8 @@ int sctp_rcv(struct sk_buff *skb)
bh_unlock_sock(sk);
/* Release the asoc/ep ref we took in the lookup calls. */
- if (asoc)
- sctp_association_put(asoc);
+ if (transport)
+ sctp_transport_put(transport);
else
sctp_endpoint_put(ep);
@@ -283,8 +284,8 @@ discard_it:
discard_release:
/* Release the asoc/ep ref we took in the lookup calls. */
- if (asoc)
- sctp_association_put(asoc);
+ if (transport)
+ sctp_transport_put(transport);
else
sctp_endpoint_put(ep);
@@ -300,6 +301,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
+ struct sctp_transport *t = chunk->transport;
struct sctp_ep_common *rcvr = NULL;
int backloged = 0;
@@ -351,7 +353,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
done:
/* Release the refs we took in sctp_add_backlog */
if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
- sctp_association_put(sctp_assoc(rcvr));
+ sctp_transport_put(t);
else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
sctp_endpoint_put(sctp_ep(rcvr));
else
@@ -363,6 +365,7 @@ done:
static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+ struct sctp_transport *t = chunk->transport;
struct sctp_ep_common *rcvr = chunk->rcvr;
int ret;
@@ -373,7 +376,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
* from us
*/
if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
- sctp_association_hold(sctp_assoc(rcvr));
+ sctp_transport_hold(t);
else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
sctp_endpoint_hold(sctp_ep(rcvr));
else
@@ -537,15 +540,15 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
return sk;
out:
- sctp_association_put(asoc);
+ sctp_transport_put(transport);
return NULL;
}
/* Common cleanup code for icmp/icmpv6 error handler. */
-void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
+void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
{
bh_unlock_sock(sk);
- sctp_association_put(asoc);
+ sctp_transport_put(t);
}
/*
@@ -605,7 +608,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
/* PMTU discovery (RFC1191) */
if (ICMP_FRAG_NEEDED == code) {
sctp_icmp_frag_needed(sk, asoc, transport,
- WORD_TRUNC(info));
+ SCTP_TRUNC4(info));
goto out_unlock;
} else {
if (ICMP_PROT_UNREACH == code) {
@@ -641,7 +644,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
}
out_unlock:
- sctp_err_finish(sk, asoc);
+ sctp_err_finish(sk, transport);
}
/*
@@ -673,7 +676,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = offset + WORD_ROUND(ntohs(ch->length));
+ ch_end = offset + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb->len)
break;
@@ -787,10 +790,9 @@ hit:
/* rhashtable for transport */
struct sctp_hash_cmp_arg {
- const struct sctp_endpoint *ep;
- const union sctp_addr *laddr;
- const union sctp_addr *paddr;
- const struct net *net;
+ const union sctp_addr *paddr;
+ const struct net *net;
+ u16 lport;
};
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -798,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
{
struct sctp_transport *t = (struct sctp_transport *)ptr;
const struct sctp_hash_cmp_arg *x = arg->key;
- struct sctp_association *asoc;
int err = 1;
if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
@@ -806,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
if (!sctp_transport_hold(t))
return err;
- asoc = t->asoc;
- if (!net_eq(sock_net(asoc->base.sk), x->net))
+ if (!net_eq(sock_net(t->asoc->base.sk), x->net))
+ goto out;
+ if (x->lport != htons(t->asoc->base.bind_addr.port))
goto out;
- if (x->ep) {
- if (x->ep != asoc->ep)
- goto out;
- } else {
- if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
- goto out;
- if (!sctp_bind_addr_match(&asoc->base.bind_addr,
- x->laddr, sctp_sk(asoc->base.sk)))
- goto out;
- }
err = 0;
out:
@@ -848,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
const struct sctp_hash_cmp_arg *x = data;
const union sctp_addr *paddr = x->paddr;
const struct net *net = x->net;
- u16 lport;
+ u16 lport = x->lport;
u32 addr;
- lport = x->ep ? htons(x->ep->base.bind_addr.port) :
- x->laddr->v4.sin_port;
if (paddr->sa.sa_family == AF_INET6)
addr = jhash(&paddr->v6.sin6_addr, 16, seed);
else
@@ -872,29 +862,32 @@ static const struct rhashtable_params sctp_hash_params = {
int sctp_transport_hashtable_init(void)
{
- return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
+ return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
}
void sctp_transport_hashtable_destroy(void)
{
- rhashtable_destroy(&sctp_transport_hashtable);
+ rhltable_destroy(&sctp_transport_hashtable);
}
-void sctp_hash_transport(struct sctp_transport *t)
+int sctp_hash_transport(struct sctp_transport *t)
{
struct sctp_hash_cmp_arg arg;
+ int err;
if (t->asoc->temp)
- return;
+ return 0;
- arg.ep = t->asoc->ep;
- arg.paddr = &t->ipaddr;
arg.net = sock_net(t->asoc->base.sk);
+ arg.paddr = &t->ipaddr;
+ arg.lport = htons(t->asoc->base.bind_addr.port);
+
+ err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
+ &t->node, sctp_hash_params);
+ if (err)
+ pr_err_once("insert transport fail, errno %d\n", err);
-reinsert:
- if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
- &t->node, sctp_hash_params) == -EBUSY)
- goto reinsert;
+ return err;
}
void sctp_unhash_transport(struct sctp_transport *t)
@@ -902,39 +895,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
if (t->asoc->temp)
return;
- rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
- sctp_hash_params);
+ rhltable_remove(&sctp_transport_hashtable, &t->node,
+ sctp_hash_params);
}
+/* return a transport with holding it */
struct sctp_transport *sctp_addrs_lookup_transport(
struct net *net,
const union sctp_addr *laddr,
const union sctp_addr *paddr)
{
+ struct rhlist_head *tmp, *list;
+ struct sctp_transport *t;
struct sctp_hash_cmp_arg arg = {
- .ep = NULL,
- .laddr = laddr,
.paddr = paddr,
.net = net,
+ .lport = laddr->v4.sin_port,
};
- return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
- sctp_hash_params);
+ list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+
+ rhl_for_each_entry_rcu(t, tmp, list, node) {
+ if (!sctp_transport_hold(t))
+ continue;
+
+ if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
+ laddr, sctp_sk(t->asoc->base.sk)))
+ return t;
+ sctp_transport_put(t);
+ }
+
+ return NULL;
}
+/* return a transport without holding it, as it's only used under sock lock */
struct sctp_transport *sctp_epaddr_lookup_transport(
const struct sctp_endpoint *ep,
const union sctp_addr *paddr)
{
struct net *net = sock_net(ep->base.sk);
+ struct rhlist_head *tmp, *list;
+ struct sctp_transport *t;
struct sctp_hash_cmp_arg arg = {
- .ep = ep,
.paddr = paddr,
.net = net,
+ .lport = htons(ep->base.bind_addr.port),
};
- return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
- sctp_hash_params);
+ list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+
+ rhl_for_each_entry_rcu(t, tmp, list, node)
+ if (ep == t->asoc->ep)
+ return t;
+
+ return NULL;
}
/* Look up an association. */
@@ -948,15 +964,12 @@ static struct sctp_association *__sctp_lookup_association(
struct sctp_association *asoc = NULL;
t = sctp_addrs_lookup_transport(net, local, peer);
- if (!t || !sctp_transport_hold(t))
+ if (!t)
goto out;
asoc = t->asoc;
- sctp_association_hold(asoc);
*pt = t;
- sctp_transport_put(t);
-
out:
return asoc;
}
@@ -986,7 +999,7 @@ int sctp_has_association(struct net *net,
struct sctp_transport *transport;
if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) {
- sctp_association_put(asoc);
+ sctp_transport_put(transport);
return 1;
}
@@ -1021,7 +1034,6 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
struct sctphdr *sh = sctp_hdr(skb);
union sctp_params params;
sctp_init_chunk_t *init;
- struct sctp_transport *transport;
struct sctp_af *af;
/*
@@ -1052,7 +1064,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
af->from_addr_param(paddr, params.addr, sh->source, 0);
- asoc = __sctp_lookup_association(net, laddr, paddr, &transport);
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
if (asoc)
return asoc;
}
@@ -1128,7 +1140,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb_tail_pointer(skb))
break;
@@ -1197,7 +1209,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
* that the chunk length doesn't cause overflow. Otherwise, we'll
* walk off the end.
*/
- if (WORD_ROUND(ntohs(ch->length)) > skb->len)
+ if (SCTP_PAD4(ntohs(ch->length)) > skb->len)
return NULL;
/* If this is INIT/INIT-ACK look inside the chunk too. */
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 6437aa97cfd7..f731de3e8428 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -213,7 +213,7 @@ new_skb:
}
chunk->chunk_hdr = ch;
- chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f473779e8b1c..64dfd35ccdcc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -71,7 +71,7 @@
#include <net/inet_ecn.h>
#include <net/sctp/sctp.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
union sctp_addr *s2);
@@ -198,7 +198,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
out_unlock:
- sctp_err_finish(sk, asoc);
+ sctp_err_finish(sk, transport);
out:
if (likely(idev != NULL))
in6_dev_put(idev);
@@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
rcu_read_lock();
- res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass);
+ res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
+ np->tclass);
rcu_read_unlock();
return res;
}
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 7e869d0cca69..4f5a2b580aa5 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
goto out;
}
- segs = skb_segment(skb, features | NETIF_F_HW_CSUM);
+ segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG);
if (IS_ERR(segs))
goto out;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 31b7bc35895d..f5320a87341e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -180,7 +180,6 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
int one_packet, gfp_t gfp)
{
sctp_xmit_t retval;
- int error = 0;
pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
@@ -188,6 +187,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
case SCTP_XMIT_PMTU_FULL:
if (!packet->has_cookie_echo) {
+ int error = 0;
+
error = sctp_packet_transmit(packet, gfp);
if (error < 0)
chunk->skb->sk->sk_err = -error;
@@ -296,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk)
{
sctp_xmit_t retval = SCTP_XMIT_OK;
- __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length));
+ __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length));
/* Check to see if this chunk will fit into the packet */
retval = sctp_packet_will_fit(packet, chunk, chunk_len);
@@ -398,176 +399,72 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
atomic_inc(&sk->sk_wmem_alloc);
}
-/* All packets are sent to the network through this function from
- * sctp_outq_tail().
- *
- * The return value is a normal kernel error return value.
- */
-int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
+static int sctp_packet_pack(struct sctp_packet *packet,
+ struct sk_buff *head, int gso, gfp_t gfp)
{
struct sctp_transport *tp = packet->transport;
- struct sctp_association *asoc = tp->asoc;
- struct sctphdr *sh;
- struct sk_buff *nskb = NULL, *head = NULL;
+ struct sctp_auth_chunk *auth = NULL;
struct sctp_chunk *chunk, *tmp;
- struct sock *sk;
- int err = 0;
- int padding; /* How much padding do we need? */
- int pkt_size;
- __u8 has_data = 0;
- int gso = 0;
- int pktcount = 0;
- struct dst_entry *dst;
- unsigned char *auth = NULL; /* pointer to auth in skb data */
+ int pkt_count = 0, pkt_size;
+ struct sock *sk = head->sk;
+ struct sk_buff *nskb;
+ int auth_len = 0;
- pr_debug("%s: packet:%p\n", __func__, packet);
-
- /* Do NOT generate a chunkless packet. */
- if (list_empty(&packet->chunk_list))
- return err;
-
- /* Set up convenience variables... */
- chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
- sk = chunk->skb->sk;
-
- /* Allocate the head skb, or main one if not in GSO */
- if (packet->size > tp->pathmtu && !packet->ipfragok) {
- if (sk_can_gso(sk)) {
- gso = 1;
- pkt_size = packet->overhead;
- } else {
- /* If this happens, we trash this packet and try
- * to build a new one, hopefully correct this
- * time. Application may notice this error.
- */
- pr_err_once("Trying to GSO but underlying device doesn't support it.");
- goto nomem;
- }
- } else {
- pkt_size = packet->size;
- }
- head = alloc_skb(pkt_size + MAX_HEADER, gfp);
- if (!head)
- goto nomem;
if (gso) {
- NAPI_GRO_CB(head)->last = head;
skb_shinfo(head)->gso_type = sk->sk_gso_type;
+ NAPI_GRO_CB(head)->last = head;
+ } else {
+ nskb = head;
+ pkt_size = packet->size;
+ goto merge;
}
- /* Make sure the outbound skb has enough header room reserved. */
- skb_reserve(head, packet->overhead + MAX_HEADER);
-
- /* Set the owning socket so that we know where to get the
- * destination IP address.
- */
- sctp_packet_set_owner_w(head, sk);
-
- if (!sctp_transport_dst_check(tp)) {
- sctp_transport_route(tp, NULL, sctp_sk(sk));
- if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
- sctp_assoc_sync_pmtu(sk, asoc);
- }
- }
- dst = dst_clone(tp->dst);
- if (!dst)
- goto no_route;
- skb_dst_set(head, dst);
-
- /* Build the SCTP header. */
- sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
- skb_reset_transport_header(head);
- sh->source = htons(packet->source_port);
- sh->dest = htons(packet->destination_port);
-
- /* From 6.8 Adler-32 Checksum Calculation:
- * After the packet is constructed (containing the SCTP common
- * header and one or more control or DATA chunks), the
- * transmitter shall:
- *
- * 1) Fill in the proper Verification Tag in the SCTP common
- * header and initialize the checksum field to 0's.
- */
- sh->vtag = htonl(packet->vtag);
- sh->checksum = 0;
-
- pr_debug("***sctp_transmit_packet***\n");
-
do {
- /* Set up convenience variables... */
- chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
- pktcount++;
-
- /* Calculate packet size, so it fits in PMTU. Leave
- * other chunks for the next packets.
- */
- if (gso) {
- pkt_size = packet->overhead;
- list_for_each_entry(chunk, &packet->chunk_list, list) {
- int padded = WORD_ROUND(chunk->skb->len);
-
- if (pkt_size + padded > tp->pathmtu)
- break;
- pkt_size += padded;
- }
+ /* calculate the pkt_size and alloc nskb */
+ pkt_size = packet->overhead;
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list,
+ list) {
+ int padded = SCTP_PAD4(chunk->skb->len);
- /* Allocate a new skb. */
- nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
- if (!nskb)
- goto nomem;
-
- /* Make sure the outbound skb has enough header
- * room reserved.
- */
- skb_reserve(nskb, packet->overhead + MAX_HEADER);
- } else {
- nskb = head;
+ if (chunk == packet->auth)
+ auth_len = padded;
+ else if (auth_len + padded + packet->overhead >
+ tp->pathmtu)
+ return 0;
+ else if (pkt_size + padded > tp->pathmtu)
+ break;
+ pkt_size += padded;
}
+ nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
+ if (!nskb)
+ return 0;
+ skb_reserve(nskb, packet->overhead + MAX_HEADER);
- /**
- * 3.2 Chunk Field Descriptions
- *
- * The total length of a chunk (including Type, Length and
- * Value fields) MUST be a multiple of 4 bytes. If the length
- * of the chunk is not a multiple of 4 bytes, the sender MUST
- * pad the chunk with all zero bytes and this padding is not
- * included in the chunk length field. The sender should
- * never pad with more than 3 bytes.
- *
- * [This whole comment explains WORD_ROUND() below.]
- */
-
+merge:
+ /* merge chunks into nskb and append nskb into head list */
pkt_size -= packet->overhead;
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ int padding;
+
list_del_init(&chunk->list);
if (sctp_chunk_is_data(chunk)) {
- /* 6.3.1 C4) When data is in flight and when allowed
- * by rule C5, a new RTT measurement MUST be made each
- * round trip. Furthermore, new RTT measurements
- * SHOULD be made no more than once per round-trip
- * for a given destination transport address.
- */
-
- if (!chunk->resent && !tp->rto_pending) {
+ if (!sctp_chunk_retransmitted(chunk) &&
+ !tp->rto_pending) {
chunk->rtt_in_progress = 1;
tp->rto_pending = 1;
}
-
- has_data = 1;
}
- padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+ padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
if (padding)
memset(skb_put(chunk->skb, padding), 0, padding);
- /* if this is the auth chunk that we are adding,
- * store pointer where it will be added and put
- * the auth into the packet.
- */
if (chunk == packet->auth)
- auth = skb_tail_pointer(nskb);
+ auth = (struct sctp_auth_chunk *)
+ skb_tail_pointer(nskb);
- memcpy(skb_put(nskb, chunk->skb->len),
- chunk->skb->data, chunk->skb->len);
+ memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data,
+ chunk->skb->len);
pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
chunk,
@@ -577,12 +474,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
ntohs(chunk->chunk_hdr->length), chunk->skb->len,
chunk->rtt_in_progress);
- /* If this is a control chunk, this is our last
- * reference. Free data chunks after they've been
- * acknowledged or have failed.
- * Re-queue auth chunks if needed.
- */
- pkt_size -= WORD_ROUND(chunk->skb->len);
+ pkt_size -= SCTP_PAD4(chunk->skb->len);
if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
sctp_chunk_free(chunk);
@@ -591,167 +483,161 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
break;
}
- /* SCTP-AUTH, Section 6.2
- * The sender MUST calculate the MAC as described in RFC2104 [2]
- * using the hash function H as described by the MAC Identifier and
- * the shared association key K based on the endpoint pair shared key
- * described by the shared key identifier. The 'data' used for the
- * computation of the AUTH-chunk is given by the AUTH chunk with its
- * HMAC field set to zero (as shown in Figure 6) followed by all
- * chunks that are placed after the AUTH chunk in the SCTP packet.
- */
- if (auth)
- sctp_auth_calculate_hmac(asoc, nskb,
- (struct sctp_auth_chunk *)auth,
- gfp);
-
- if (packet->auth) {
- if (!list_empty(&packet->chunk_list)) {
- /* We will generate more packets, so re-queue
- * auth chunk.
- */
+ if (auth) {
+ sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp);
+ /* free auth if no more chunks, or add it back */
+ if (list_empty(&packet->chunk_list))
+ sctp_chunk_free(packet->auth);
+ else
list_add(&packet->auth->list,
&packet->chunk_list);
- } else {
- sctp_chunk_free(packet->auth);
- packet->auth = NULL;
- }
}
- if (!gso)
- break;
+ if (gso) {
+ if (skb_gro_receive(&head, nskb)) {
+ kfree_skb(nskb);
+ return 0;
+ }
+ if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
+ sk->sk_gso_max_segs))
+ return 0;
+ }
- if (skb_gro_receive(&head, nskb))
- goto nomem;
- nskb = NULL;
- if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
- sk->sk_gso_max_segs))
- goto nomem;
+ pkt_count++;
} while (!list_empty(&packet->chunk_list));
- /* 2) Calculate the Adler-32 checksum of the whole packet,
- * including the SCTP common header and all the
- * chunks.
- *
- * Note: Adler-32 is no longer applicable, as has been replaced
- * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
- *
- * If it's a GSO packet, it's postponed to sctp_skb_segment.
- */
- if (!sctp_checksum_disable || gso) {
- if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
- dst_xfrm(dst) || packet->ipfragok)) {
- sh->checksum = sctp_compute_cksum(head, 0);
- } else {
- /* no need to seed pseudo checksum for SCTP */
- head->ip_summed = CHECKSUM_PARTIAL;
- head->csum_start = skb_transport_header(head) - head->head;
- head->csum_offset = offsetof(struct sctphdr, checksum);
+ if (gso) {
+ memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+ skb_shinfo(head)->gso_segs = pkt_count;
+ skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+ rcu_read_lock();
+ if (skb_dst(head) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
}
+ rcu_read_unlock();
+ goto chksum;
}
- /* IP layer ECN support
- * From RFC 2481
- * "The ECN-Capable Transport (ECT) bit would be set by the
- * data sender to indicate that the end-points of the
- * transport protocol are ECN-capable."
- *
- * Now setting the ECT bit all the time, as it should not cause
- * any problems protocol-wise even if our peer ignores it.
- *
- * Note: The works for IPv6 layer checks this bit too later
- * in transmission. See IP6_ECN_flow_xmit().
- */
- tp->af_specific->ecn_capable(sk);
+ if (sctp_checksum_disable)
+ return 1;
- /* Set up the IP options. */
- /* BUG: not implemented
- * For v4 this all lives somewhere in sk->sk_opt...
- */
+ if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) ||
+ dst_xfrm(skb_dst(head)) || packet->ipfragok) {
+ struct sctphdr *sh =
+ (struct sctphdr *)skb_transport_header(head);
- /* Dump that on IP! */
- if (asoc) {
- asoc->stats.opackets += pktcount;
- if (asoc->peer.last_sent_to != tp)
- /* Considering the multiple CPU scenario, this is a
- * "correcter" place for last_sent_to. --xguo
- */
- asoc->peer.last_sent_to = tp;
+ sh->checksum = sctp_compute_cksum(head, 0);
+ } else {
+chksum:
+ head->ip_summed = CHECKSUM_PARTIAL;
+ head->csum_start = skb_transport_header(head) - head->head;
+ head->csum_offset = offsetof(struct sctphdr, checksum);
}
- if (has_data) {
- struct timer_list *timer;
- unsigned long timeout;
+ return pkt_count;
+}
- /* Restart the AUTOCLOSE timer when sending data. */
- if (sctp_state(asoc, ESTABLISHED) &&
- asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
- timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
- timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+/* All packets are sent to the network through this function from
+ * sctp_outq_tail().
+ *
+ * The return value is always 0 for now.
+ */
+int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
+{
+ struct sctp_transport *tp = packet->transport;
+ struct sctp_association *asoc = tp->asoc;
+ struct sctp_chunk *chunk, *tmp;
+ int pkt_count, gso = 0;
+ struct dst_entry *dst;
+ struct sk_buff *head;
+ struct sctphdr *sh;
+ struct sock *sk;
+
+ pr_debug("%s: packet:%p\n", __func__, packet);
+ if (list_empty(&packet->chunk_list))
+ return 0;
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+ sk = chunk->skb->sk;
- if (!mod_timer(timer, jiffies + timeout))
- sctp_association_hold(asoc);
+ /* check gso */
+ if (packet->size > tp->pathmtu && !packet->ipfragok) {
+ if (!sk_can_gso(sk)) {
+ pr_err_once("Trying to GSO but underlying device doesn't support it.");
+ goto out;
}
+ gso = 1;
}
+ /* alloc head skb */
+ head = alloc_skb((gso ? packet->overhead : packet->size) +
+ MAX_HEADER, gfp);
+ if (!head)
+ goto out;
+ skb_reserve(head, packet->overhead + MAX_HEADER);
+ sctp_packet_set_owner_w(head, sk);
+
+ /* set sctp header */
+ sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
+ skb_reset_transport_header(head);
+ sh->source = htons(packet->source_port);
+ sh->dest = htons(packet->destination_port);
+ sh->vtag = htonl(packet->vtag);
+ sh->checksum = 0;
+
+ /* update dst if in need */
+ if (!sctp_transport_dst_check(tp)) {
+ sctp_transport_route(tp, NULL, sctp_sk(sk));
+ if (asoc && asoc->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(sk, asoc);
+ }
+ dst = dst_clone(tp->dst);
+ if (!dst) {
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(head);
+ goto out;
+ }
+ skb_dst_set(head, dst);
+
+ /* pack up chunks */
+ pkt_count = sctp_packet_pack(packet, head, gso, gfp);
+ if (!pkt_count) {
+ kfree_skb(head);
+ goto out;
+ }
pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
- if (gso) {
- /* Cleanup our debris for IP stacks */
- memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
- sizeof(struct inet6_skb_parm)));
+ /* start autoclose timer */
+ if (packet->has_data && sctp_state(asoc, ESTABLISHED) &&
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
+ struct timer_list *timer =
+ &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+ unsigned long timeout =
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
- skb_shinfo(head)->gso_segs = pktcount;
- skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+ if (!mod_timer(timer, jiffies + timeout))
+ sctp_association_hold(asoc);
+ }
- /* We have to refresh this in case we are xmiting to
- * more than one transport at a time
- */
- rcu_read_lock();
- if (__sk_dst_get(sk) != tp->dst) {
- dst_hold(tp->dst);
- sk_setup_caps(sk, tp->dst);
- }
- rcu_read_unlock();
+ /* sctp xmit */
+ tp->af_specific->ecn_capable(sk);
+ if (asoc) {
+ asoc->stats.opackets += pkt_count;
+ if (asoc->peer.last_sent_to != tp)
+ asoc->peer.last_sent_to = tp;
}
head->ignore_df = packet->ipfragok;
tp->af_specific->sctp_xmit(head, tp);
out:
- sctp_packet_reset(packet);
- return err;
-no_route:
- kfree_skb(head);
- if (nskb != head)
- kfree_skb(nskb);
-
- if (asoc)
- IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
-
- /* FIXME: Returning the 'err' will effect all the associations
- * associated with a socket, although only one of the paths of the
- * association is unreachable.
- * The real failure of a transport or association can be passed on
- * to the user via notifications. So setting this error may not be
- * required.
- */
- /* err = -EHOSTUNREACH; */
-err:
- /* Control chunks are unreliable so just drop them. DATA chunks
- * will get resent or dropped later.
- */
-
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
list_del_init(&chunk->list);
if (!sctp_chunk_is_data(chunk))
sctp_chunk_free(chunk);
}
- goto out;
-nomem:
- if (packet->auth && list_empty(&packet->auth->list))
- sctp_chunk_free(packet->auth);
- err = -ENOMEM;
- goto err;
+ sctp_packet_reset(packet);
+ return 0;
}
/********************************************************************
@@ -867,9 +753,6 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
rwnd = 0;
asoc->peer.rwnd = rwnd;
- /* Has been accepted for transmission. */
- if (!asoc->peer.prsctp_capable)
- chunk->msg->can_abandon = 0;
sctp_chunk_assign_tsn(chunk);
sctp_chunk_assign_ssn(chunk);
}
@@ -913,7 +796,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
*/
maxsize = pmtu - packet->overhead;
if (packet->auth)
- maxsize -= WORD_ROUND(packet->auth->skb->len);
+ maxsize -= SCTP_PAD4(packet->auth->skb->len);
if (chunk_len > maxsize)
retval = SCTP_XMIT_PMTU_FULL;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 107233da5cc9..34efaa4ef2f6 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
/* Add data to the front of the queue. */
static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q)
}
/* Put a new chunk in an sctp_outq. */
-int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
+void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
{
struct net *net = sock_net(q->asoc->base.sk);
- int error = 0;
pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
chunk && chunk->chunk_hdr ?
@@ -299,54 +298,26 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
* immediately.
*/
if (sctp_chunk_is_data(chunk)) {
- /* Is it OK to queue data chunks? */
- /* From 9. Termination of Association
- *
- * When either endpoint performs a shutdown, the
- * association on each peer will stop accepting new
- * data from its user and only deliver data in queue
- * at the time of sending or receiving the SHUTDOWN
- * chunk.
- */
- switch (q->asoc->state) {
- case SCTP_STATE_CLOSED:
- case SCTP_STATE_SHUTDOWN_PENDING:
- case SCTP_STATE_SHUTDOWN_SENT:
- case SCTP_STATE_SHUTDOWN_RECEIVED:
- case SCTP_STATE_SHUTDOWN_ACK_SENT:
- /* Cannot send after transport endpoint shutdown */
- error = -ESHUTDOWN;
- break;
-
- default:
- pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
- __func__, q, chunk, chunk && chunk->chunk_hdr ?
- sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
- "illegal chunk");
-
- sctp_chunk_hold(chunk);
- sctp_outq_tail_data(q, chunk);
- if (chunk->asoc->peer.prsctp_capable &&
- SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
- chunk->asoc->sent_cnt_removable++;
- if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
- SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
- else
- SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
- break;
- }
+ pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
+ __func__, q, chunk, chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk");
+
+ sctp_outq_tail_data(q, chunk);
+ if (chunk->asoc->peer.prsctp_capable &&
+ SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+ chunk->asoc->sent_cnt_removable++;
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
+ else
+ SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
} else {
list_add_tail(&chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
}
- if (error < 0)
- return error;
-
if (!q->cork)
- error = sctp_outq_flush(q, 0, gfp);
-
- return error;
+ sctp_outq_flush(q, 0, gfp);
}
/* Insert a chunk into the sorted list based on the TSNs. The retransmit list
@@ -536,8 +507,6 @@ void sctp_retransmit_mark(struct sctp_outq *q,
transport->rto_pending = 0;
}
- chunk->resent = 1;
-
/* Move the chunk to the retransmit queue. The chunks
* on the retransmit queue are always kept in order.
*/
@@ -559,7 +528,6 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
sctp_retransmit_reason_t reason)
{
struct net *net = sock_net(q->asoc->base.sk);
- int error = 0;
switch (reason) {
case SCTP_RTXR_T3_RTX:
@@ -603,10 +571,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
* will be flushed at the end.
*/
if (reason != SCTP_RTXR_FAST_RTX)
- error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
-
- if (error)
- q->asoc->base.sk->sk_err = -error;
+ sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
}
/*
@@ -778,12 +743,12 @@ redo:
}
/* Cork the outqueue so queued chunks are really queued. */
-int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
+void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
{
if (q->cork)
q->cork = 0;
- return sctp_outq_flush(q, 0, gfp);
+ sctp_outq_flush(q, 0, gfp);
}
@@ -796,7 +761,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
* locking concerns must be made. Today we use the sock lock to protect
* this function.
*/
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
{
struct sctp_packet *packet;
struct sctp_packet singleton;
@@ -919,8 +884,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
sctp_packet_config(&singleton, vtag, 0);
sctp_packet_append_chunk(&singleton, chunk);
error = sctp_packet_transmit(&singleton, gfp);
- if (error < 0)
- return error;
+ if (error < 0) {
+ asoc->base.sk->sk_err = -error;
+ return;
+ }
break;
case SCTP_CID_ABORT:
@@ -1018,6 +985,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
retran:
error = sctp_outq_flush_rtx(q, packet,
rtx_timeout, &start_timer);
+ if (error < 0)
+ asoc->base.sk->sk_err = -error;
if (start_timer) {
sctp_transport_reset_t3_rtx(transport);
@@ -1079,7 +1048,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
(new_transport->state == SCTP_PF)))
new_transport = asoc->peer.active_path;
if (new_transport->state == SCTP_UNCONFIRMED) {
- WARN_ONCE(1, "Atempt to send packet on unconfirmed path.");
+ WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
sctp_chunk_fail(chunk, 0);
sctp_chunk_free(chunk);
continue;
@@ -1192,14 +1161,15 @@ sctp_flush_out:
struct sctp_transport,
send_ready);
packet = &t->packet;
- if (!sctp_packet_empty(packet))
+ if (!sctp_packet_empty(packet)) {
error = sctp_packet_transmit(packet, gfp);
+ if (error < 0)
+ asoc->base.sk->sk_err = -error;
+ }
/* Clear the burst limited state, if any */
sctp_transport_burst_reset(t);
}
-
- return error;
}
/* Update unack_data based on the incoming SACK chunk */
@@ -1467,7 +1437,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* instance).
*/
if (!tchunk->tsn_gap_acked &&
- !tchunk->resent &&
+ !sctp_chunk_retransmitted(tchunk) &&
tchunk->rtt_in_progress) {
tchunk->rtt_in_progress = 0;
rtt = jiffies - tchunk->sent_at;
@@ -1747,7 +1717,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
{
int i;
sctp_sack_variable_t *frags;
- __u16 gap;
+ __u16 tsn_offset, blocks;
__u32 ctsn = ntohl(sack->cum_tsn_ack);
if (TSN_lte(tsn, ctsn))
@@ -1766,10 +1736,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
*/
frags = sack->variable;
- gap = tsn - ctsn;
- for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) {
- if (TSN_lte(ntohs(frags[i].gab.start), gap) &&
- TSN_lte(gap, ntohs(frags[i].gab.end)))
+ blocks = ntohs(sack->num_gap_ack_blocks);
+ tsn_offset = tsn - ctsn;
+ for (i = 0; i < blocks; ++i) {
+ if (tsn_offset >= ntohs(frags[i].gab.start) &&
+ tsn_offset <= ntohs(frags[i].gab.end))
goto pass;
}
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ef8ba77a5bea..206377fe91ec 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -73,13 +73,17 @@ static const struct snmp_mib sctp_snmp_list[] = {
/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
{
+ unsigned long buff[SCTP_MIB_MAX];
struct net *net = seq->private;
int i;
- for (i = 0; sctp_snmp_list[i].name != NULL; i++)
+ memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX);
+
+ snmp_get_cpu_field_batch(buff, sctp_snmp_list,
+ net->sctp.sctp_statistics);
+ for (i = 0; sctp_snmp_list[i].name; i++)
seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
- snmp_fold_field(net->sctp.sctp_statistics,
- sctp_snmp_list[i].entry));
+ buff[i]);
return 0;
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 7b523e3f551f..616a9428e0c4 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -205,26 +205,30 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
if (!addr->valid)
continue;
- if (sctp_in_scope(net, &addr->a, scope)) {
- /* Now that the address is in scope, check to see if
- * the address type is really supported by the local
- * sock as well as the remote peer.
- */
- if ((((AF_INET == addr->a.sa.sa_family) &&
- (copy_flags & SCTP_ADDR4_PEERSUPP))) ||
- (((AF_INET6 == addr->a.sa.sa_family) &&
- (copy_flags & SCTP_ADDR6_ALLOWED) &&
- (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
- error = sctp_add_bind_addr(bp, &addr->a,
- sizeof(addr->a),
- SCTP_ADDR_SRC, GFP_ATOMIC);
- if (error)
- goto end_copy;
- }
- }
+ if (!sctp_in_scope(net, &addr->a, scope))
+ continue;
+
+ /* Now that the address is in scope, check to see if
+ * the address type is really supported by the local
+ * sock as well as the remote peer.
+ */
+ if (addr->a.sa.sa_family == AF_INET &&
+ !(copy_flags & SCTP_ADDR4_PEERSUPP))
+ continue;
+ if (addr->a.sa.sa_family == AF_INET6 &&
+ (!(copy_flags & SCTP_ADDR6_ALLOWED) ||
+ !(copy_flags & SCTP_ADDR6_PEERSUPP)))
+ continue;
+
+ if (sctp_bind_addr_state(bp, &addr->a) != -1)
+ continue;
+
+ error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
+ SCTP_ADDR_SRC, GFP_ATOMIC);
+ if (error)
+ break;
}
-end_copy:
rcu_read_unlock();
return error;
}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index cef0cee182d4..048954eee984 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
int portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct list_head *addr_list;
@@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
r->idiag_retrans = 0;
}
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
@@ -203,6 +204,7 @@ struct sctp_comm_param {
struct netlink_callback *cb;
const struct inet_diag_req_v2 *r;
const struct nlmsghdr *nlh;
+ bool net_admin;
};
static size_t inet_assoc_attr_size(struct sctp_association *asoc)
@@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(addrlen * asoc->peer.transport_count)
+ nla_total_size(addrlen * addrcnt)
+ nla_total_size(sizeof(struct inet_diag_meminfo))
@@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
err = inet_sctp_diag_fill(sk, assoc, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ commp->net_admin);
release_sock(sk);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
@@ -299,7 +303,8 @@ static int sctp_sock_dump(struct sock *sk, void *p)
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh) < 0) {
+ NLM_F_MULTI, cb->nlh,
+ commp->net_admin) < 0) {
cb->args[3] = 1;
err = 1;
goto release;
@@ -309,7 +314,8 @@ static int sctp_sock_dump(struct sock *sk, void *p)
if (inet_sctp_diag_fill(sk, assoc, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
+ cb->nlh->nlmsg_seq, 0, cb->nlh,
+ commp->net_admin) < 0) {
err = 1;
goto release;
}
@@ -389,7 +395,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh) < 0) {
+ cb->nlh, commp->net_admin) < 0) {
err = 2;
goto out;
}
@@ -426,6 +432,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
.skb = in_skb,
.r = req,
.nlh = nlh,
+ .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
};
if (req->sdiag_family == AF_INET) {
@@ -461,6 +468,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
.skb = skb,
.cb = cb,
.r = r,
+ .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
};
/* eps hashtable dumps
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 46ffecc57214..9e9690b7afe1 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_types = sp->pf->supported_addrs(sp, types);
chunksize = sizeof(init) + addrs_len;
- chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
+ chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
chunksize += sizeof(ecap_param);
if (asoc->prsctp_enable)
@@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* Add HMACS parameter length if any were defined */
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
/* Add CHUNKS parameter length */
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* If we have any extensions to report, account for that */
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* RFC 2960 3.3.2 Initiation (INIT) (1)
*
@@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
}
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* Now allocate and fill out the chunk. */
retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
@@ -1375,7 +1375,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
struct sock *sk;
/* No need to allocate LL here, as this is only a chunk. */
- skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
+ skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp);
if (!skb)
goto nodata;
@@ -1467,7 +1467,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
void *target;
void *padding;
int chunklen = ntohs(chunk->chunk_hdr->length);
- int padlen = WORD_ROUND(chunklen) - chunklen;
+ int padlen = SCTP_PAD4(chunklen) - chunklen;
padding = skb_put(chunk->skb, padlen);
target = skb_put(chunk->skb, len);
@@ -1885,7 +1885,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc,
struct __sctp_missing report;
__u16 len;
- len = WORD_ROUND(sizeof(report));
+ len = SCTP_PAD4(sizeof(report));
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
@@ -2083,9 +2083,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
if (*errp) {
if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- WORD_ROUND(ntohs(param.p->length))))
+ SCTP_PAD4(ntohs(param.p->length))))
sctp_addto_chunk_fixed(*errp,
- WORD_ROUND(ntohs(param.p->length)),
+ SCTP_PAD4(ntohs(param.p->length)),
param.v);
} else {
/* If there is no memory for generating the ERROR
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 12d45193357c..c345bf153bed 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1020,19 +1020,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
* This way the whole message is queued up and bundling if
* encouraged for small fragments.
*/
-static int sctp_cmd_send_msg(struct sctp_association *asoc,
- struct sctp_datamsg *msg, gfp_t gfp)
+static void sctp_cmd_send_msg(struct sctp_association *asoc,
+ struct sctp_datamsg *msg, gfp_t gfp)
{
struct sctp_chunk *chunk;
- int error = 0;
-
- list_for_each_entry(chunk, &msg->chunks, frag_list) {
- error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
- if (error)
- break;
- }
- return error;
+ list_for_each_entry(chunk, &msg->chunks, frag_list)
+ sctp_outq_tail(&asoc->outqueue, chunk, gfp);
}
@@ -1427,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
local_cork = 1;
}
/* Send a chunk to our peer. */
- error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk,
- gfp);
+ sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp);
break;
case SCTP_CMD_SEND_PKT:
@@ -1682,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_FORCE_PRIM_RETRAN:
t = asoc->peer.retran_path;
asoc->peer.retran_path = asoc->peer.primary_path;
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
asoc->peer.retran_path = t;
break;
@@ -1709,7 +1702,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_outq_cork(&asoc->outqueue);
local_cork = 1;
}
- error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
+ sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
break;
case SCTP_CMD_SEND_NEXT_ASCONF:
sctp_cmd_send_asconf(asoc);
@@ -1739,9 +1732,9 @@ out:
*/
if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
if (chunk->end_of_packet || chunk->singleton)
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
} else if (local_cork)
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
if (sp->data_ready_signalled)
sp->data_ready_signalled = 0;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d88bb2b0b699..8ec20a64a3f8 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3422,6 +3422,12 @@ sctp_disposition_t sctp_sf_ootb(struct net *net,
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
+ /* Report violation if chunk len overflows */
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
+ if (ch_end > skb_tail_pointer(skb))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
/* Now that we know we at least have a chunk header,
* do things that are type appropriate.
*/
@@ -3453,12 +3459,6 @@ sctp_disposition_t sctp_sf_ootb(struct net *net,
}
}
- /* Report violation if chunk len overflows */
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
- if (ch_end > skb_tail_pointer(skb))
- return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
- commands);
-
ch = (sctp_chunkhdr_t *) ch_end;
} while (ch_end < skb_tail_pointer(skb));
@@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
@@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 8ed2d99bde6d..1b5d669e3029 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -235,8 +235,12 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
sctp_assoc_t id)
{
struct sctp_association *addr_asoc = NULL, *id_asoc = NULL;
- struct sctp_transport *transport;
+ struct sctp_af *af = sctp_get_af_specific(addr->ss_family);
union sctp_addr *laddr = (union sctp_addr *)addr;
+ struct sctp_transport *transport;
+
+ if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
+ return NULL;
addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
laddr,
@@ -1214,9 +1218,12 @@ static int __sctp_connect(struct sock *sk,
timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
- err = sctp_wait_for_connect(asoc, &timeo);
- if ((err == 0 || err == -EINPROGRESS) && assoc_id)
+ if (assoc_id)
*assoc_id = asoc->assoc_id;
+ err = sctp_wait_for_connect(asoc, &timeo);
+ /* Note: the asoc may be freed after the return of
+ * sctp_wait_for_connect.
+ */
/* Don't free association on exit. */
asoc = NULL;
@@ -1958,6 +1965,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
+ sctp_chunk_hold(chunk);
+
/* Do accounting for the write space. */
sctp_set_owner_w(chunk);
@@ -1970,13 +1979,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
* breaks.
*/
err = sctp_primitive_SEND(net, asoc, datamsg);
- sctp_datamsg_put(datamsg);
/* Did the lower layer accept the chunk? */
- if (err)
+ if (err) {
+ sctp_datamsg_free(datamsg);
goto out_free;
+ }
pr_debug("%s: we sent primitively\n", __func__);
+ sctp_datamsg_put(datamsg);
err = msg_len;
if (unlikely(wait_connect)) {
@@ -4278,19 +4289,18 @@ static void sctp_shutdown(struct sock *sk, int how)
{
struct net *net = sock_net(sk);
struct sctp_endpoint *ep;
- struct sctp_association *asoc;
if (!sctp_style(sk, TCP))
return;
- if (how & SEND_SHUTDOWN) {
+ ep = sctp_sk(sk)->ep;
+ if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) {
+ struct sctp_association *asoc;
+
sk->sk_state = SCTP_SS_CLOSING;
- ep = sctp_sk(sk)->ep;
- if (!list_empty(&ep->asocs)) {
- asoc = list_entry(ep->asocs.next,
- struct sctp_association, asocs);
- sctp_primitive_SHUTDOWN(net, asoc, NULL);
- }
+ asoc = list_entry(ep->asocs.next,
+ struct sctp_association, asocs);
+ sctp_primitive_SHUTDOWN(net, asoc, NULL);
}
}
@@ -4386,10 +4396,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
{
int err;
- err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
- GFP_KERNEL);
- if (err)
- return err;
+ rhltable_walk_enter(&sctp_transport_hashtable, iter);
err = rhashtable_walk_start(iter);
if (err && err != -EAGAIN) {
@@ -4469,21 +4476,17 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
const union sctp_addr *paddr, void *p)
{
struct sctp_transport *transport;
- int err = -ENOENT;
+ int err;
rcu_read_lock();
transport = sctp_addrs_lookup_transport(net, laddr, paddr);
- if (!transport || !sctp_transport_hold(transport))
- goto out;
-
- sctp_association_hold(transport->asoc);
- sctp_transport_put(transport);
-
rcu_read_unlock();
+ if (!transport)
+ return -ENOENT;
+
err = cb(transport, p);
- sctp_association_put(transport->asoc);
+ sctp_transport_put(transport);
-out:
return err;
}
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
@@ -4683,7 +4686,7 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
int __user *optlen)
{
- if (len <= 0)
+ if (len == 0)
return -EINVAL;
if (len > sizeof(struct sctp_event_subscribe))
len = sizeof(struct sctp_event_subscribe);
@@ -6426,6 +6429,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
if (get_user(len, optlen))
return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
lock_sock(sk);
switch (optname) {
@@ -7420,7 +7426,8 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
*/
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
- BUG_ON(sk != asoc->base.sk);
+ if (sk != asoc->base.sk)
+ goto do_error;
lock_sock(sk);
*timeo_p = current_timeo;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 81b86678be4d..a1652ab63918 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
*/
peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
- peer->last_time_heard = ktime_set(0, 0);
+ peer->last_time_heard = 0;
peer->last_time_ecne_reduced = jiffies;
peer->param_flags = SPP_HB_DISABLE |
@@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
} else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport,
return;
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d85b803da11d..bea00058ce35 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
ch = (sctp_errhdr_t *)(chunk->skb->data);
cause = ch->cause;
- elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
+ elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
/* Pull off the ERROR header. */
skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
@@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
* MUST ignore the padding bytes.
*/
len = ntohs(chunk->chunk_hdr->length);
- padding = WORD_ROUND(len) - len;
+ padding = SCTP_PAD4(len) - len;
/* Fixup cloned skb with just this chunks data. */
skb_trim(skb, chunk->chunk_end - padding - skb->data);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 877e55066f89..84d0fdaf7de9 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
* we can go ahead and clear out the lobby in one shot
*/
if (!skb_queue_empty(&sp->pd_lobby)) {
- struct list_head *list;
skb_queue_splice_tail_init(&sp->pd_lobby,
&sk->sk_receive_queue);
- list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
- INIT_LIST_HEAD(list);
return 1;
}
} else {
diff --git a/net/socket.c b/net/socket.c
index a1bd16106625..0758e13754e2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -90,7 +90,7 @@
#include <linux/slab.h>
#include <linux/xattr.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <net/compat.h>
@@ -320,11 +320,53 @@ static const struct dentry_operations sockfs_dentry_operations = {
.d_dname = sockfs_dname,
};
+static int sockfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *suffix, void *value, size_t size)
+{
+ if (value) {
+ if (dentry->d_name.len + 1 > size)
+ return -ERANGE;
+ memcpy(value, dentry->d_name.name, dentry->d_name.len + 1);
+ }
+ return dentry->d_name.len + 1;
+}
+
+#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
+#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
+#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
+
+static const struct xattr_handler sockfs_xattr_handler = {
+ .name = XATTR_NAME_SOCKPROTONAME,
+ .get = sockfs_xattr_get,
+};
+
+static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *suffix, const void *value,
+ size_t size, int flags)
+{
+ /* Handled by LSM. */
+ return -EAGAIN;
+}
+
+static const struct xattr_handler sockfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .set = sockfs_security_xattr_set,
+};
+
+static const struct xattr_handler *sockfs_xattr_handlers[] = {
+ &sockfs_xattr_handler,
+ &sockfs_security_xattr_handler,
+ NULL
+};
+
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "socket:", &sockfs_ops,
- &sockfs_dentry_operations, SOCKFS_MAGIC);
+ return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
+ sockfs_xattr_handlers,
+ &sockfs_dentry_operations, SOCKFS_MAGIC);
}
static struct vfsmount *sock_mnt __read_mostly;
@@ -463,35 +505,6 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
return NULL;
}
-#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
-#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
-#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
-static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- const char *proto_name;
- size_t proto_size;
- int error;
-
- error = -ENODATA;
- if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) {
- proto_name = dentry->d_name.name;
- proto_size = strlen(proto_name);
-
- if (value) {
- error = -ERANGE;
- if (proto_size + 1 > size)
- goto out;
-
- strncpy(value, proto_name, proto_size + 1);
- }
- error = proto_size + 1;
- }
-
-out:
- return error;
-}
-
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
size_t size)
{
@@ -520,9 +533,22 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
return used;
}
+static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int err = simple_setattr(dentry, iattr);
+
+ if (!err && (iattr->ia_valid & ATTR_UID)) {
+ struct socket *sock = SOCKET_I(d_inode(dentry));
+
+ sock->sk->sk_uid = iattr->ia_uid;
+ }
+
+ return err;
+}
+
static const struct inode_operations sockfs_inode_ops = {
- .getxattr = sockfs_getxattr,
.listxattr = sockfs_listxattr,
+ .setattr = sockfs_setattr,
};
/**
@@ -642,7 +668,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
/* Race occurred between timestamp enabling and packet
receiving. Fill in the current time for now. */
- if (need_software_tstamp && skb->tstamp.tv64 == 0)
+ if (need_software_tstamp && skb->tstamp == 0)
__net_timestamp(skb);
if (need_software_tstamp) {
@@ -667,9 +693,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
empty = 0;
- if (!empty)
+ if (!empty) {
put_cmsg(msg, SOL_SOCKET,
SCM_TIMESTAMPING, sizeof(tss), &tss);
+
+ if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS))
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
+ skb->len, skb->data);
+ }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
@@ -880,6 +911,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
* what to do with it - that's up to the protocol still.
*/
+static struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ return &get_net(container_of(ns, struct net, ns))->ns;
+}
+
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct socket *sock;
@@ -948,6 +984,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
err = dlci_ioctl_hook(cmd, argp);
mutex_unlock(&dlci_ioctl_mutex);
break;
+ case SIOCGSKNS:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+
+ err = open_related_ns(&net->ns, get_net_ns);
+ break;
default:
err = sock_do_ioctl(net, sock, cmd, arg);
break;
@@ -1875,7 +1918,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct sockaddr_storage address;
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
- __attribute__ ((aligned(sizeof(__kernel_size_t))));
+ __aligned(sizeof(__kernel_size_t));
/* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
int ctl_len;
@@ -2041,6 +2084,8 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
if (err)
break;
++datagrams;
+ if (msg_data_left(&msg_sys))
+ break;
cond_resched();
}
@@ -3096,6 +3141,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCSIFVLAN:
case SIOCADDDLCI:
case SIOCDELDLCI:
+ case SIOCGSKNS:
return sock_ioctl(file, cmd, arg);
case SIOCGIFFLAGS:
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
new file mode 100644
index 000000000000..6cff3f6d0c3a
--- /dev/null
+++ b/net/strparser/Kconfig
@@ -0,0 +1,4 @@
+
+config STREAM_PARSER
+ tristate
+ default n
diff --git a/net/strparser/Makefile b/net/strparser/Makefile
new file mode 100644
index 000000000000..858a126ebaa0
--- /dev/null
+++ b/net/strparser/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_STREAM_PARSER) += strparser.o
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
new file mode 100644
index 000000000000..41adf362936d
--- /dev/null
+++ b/net/strparser/strparser.c
@@ -0,0 +1,510 @@
+/*
+ * Stream Parser
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/strparser.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+
+static struct workqueue_struct *strp_wq;
+
+struct _strp_rx_msg {
+ /* Internal cb structure. struct strp_rx_msg must be first for passing
+ * to upper layer.
+ */
+ struct strp_rx_msg strp;
+ int accum_len;
+ int early_eaten;
+};
+
+static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb)
+{
+ return (struct _strp_rx_msg *)((void *)skb->cb +
+ offsetof(struct qdisc_skb_cb, data));
+}
+
+/* Lower lock held */
+static void strp_abort_rx_strp(struct strparser *strp, int err)
+{
+ struct sock *csk = strp->sk;
+
+ /* Unrecoverable error in receive */
+
+ del_timer(&strp->rx_msg_timer);
+
+ if (strp->rx_stopped)
+ return;
+
+ strp->rx_stopped = 1;
+
+ /* Report an error on the lower socket */
+ csk->sk_err = err;
+ csk->sk_error_report(csk);
+}
+
+static void strp_start_rx_timer(struct strparser *strp)
+{
+ if (strp->sk->sk_rcvtimeo)
+ mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo);
+}
+
+/* Lower lock held */
+static void strp_parser_err(struct strparser *strp, int err,
+ read_descriptor_t *desc)
+{
+ desc->error = err;
+ kfree_skb(strp->rx_skb_head);
+ strp->rx_skb_head = NULL;
+ strp->cb.abort_parser(strp, err);
+}
+
+static inline int strp_peek_len(struct strparser *strp)
+{
+ struct socket *sock = strp->sk->sk_socket;
+
+ return sock->ops->peek_len(sock);
+}
+
+/* Lower socket lock held */
+static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+ unsigned int orig_offset, size_t orig_len)
+{
+ struct strparser *strp = (struct strparser *)desc->arg.data;
+ struct _strp_rx_msg *rxm;
+ struct sk_buff *head, *skb;
+ size_t eaten = 0, cand_len;
+ ssize_t extra;
+ int err;
+ bool cloned_orig = false;
+
+ if (strp->rx_paused)
+ return 0;
+
+ head = strp->rx_skb_head;
+ if (head) {
+ /* Message already in progress */
+
+ rxm = _strp_rx_msg(head);
+ if (unlikely(rxm->early_eaten)) {
+ /* Already some number of bytes on the receive sock
+ * data saved in rx_skb_head, just indicate they
+ * are consumed.
+ */
+ eaten = orig_len <= rxm->early_eaten ?
+ orig_len : rxm->early_eaten;
+ rxm->early_eaten -= eaten;
+
+ return eaten;
+ }
+
+ if (unlikely(orig_offset)) {
+ /* Getting data with a non-zero offset when a message is
+ * in progress is not expected. If it does happen, we
+ * need to clone and pull since we can't deal with
+ * offsets in the skbs for a message expect in the head.
+ */
+ orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!orig_skb) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ if (!pskb_pull(orig_skb, orig_offset)) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ kfree_skb(orig_skb);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ cloned_orig = true;
+ orig_offset = 0;
+ }
+
+ if (!strp->rx_skb_nextp) {
+ /* We are going to append to the frags_list of head.
+ * Need to unshare the frag_list.
+ */
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ desc->error = err;
+ return 0;
+ }
+
+ if (unlikely(skb_shinfo(head)->frag_list)) {
+ /* We can't append to an sk_buff that already
+ * has a frag_list. We create a new head, point
+ * the frag_list of that to the old head, and
+ * then are able to use the old head->next for
+ * appending to the message.
+ */
+ if (WARN_ON(head->next)) {
+ desc->error = -EINVAL;
+ return 0;
+ }
+
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ skb->len = head->len;
+ skb->data_len = head->len;
+ skb->truesize = head->truesize;
+ *_strp_rx_msg(skb) = *_strp_rx_msg(head);
+ strp->rx_skb_nextp = &head->next;
+ skb_shinfo(skb)->frag_list = head;
+ strp->rx_skb_head = skb;
+ head = skb;
+ } else {
+ strp->rx_skb_nextp =
+ &skb_shinfo(head)->frag_list;
+ }
+ }
+ }
+
+ while (eaten < orig_len) {
+ /* Always clone since we will consume something */
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ break;
+ }
+
+ cand_len = orig_len - eaten;
+
+ head = strp->rx_skb_head;
+ if (!head) {
+ head = skb;
+ strp->rx_skb_head = head;
+ /* Will set rx_skb_nextp on next packet if needed */
+ strp->rx_skb_nextp = NULL;
+ rxm = _strp_rx_msg(head);
+ memset(rxm, 0, sizeof(*rxm));
+ rxm->strp.offset = orig_offset + eaten;
+ } else {
+ /* Unclone since we may be appending to an skb that we
+ * already share a frag_list with.
+ */
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.rx_mem_fail);
+ desc->error = err;
+ break;
+ }
+
+ rxm = _strp_rx_msg(head);
+ *strp->rx_skb_nextp = skb;
+ strp->rx_skb_nextp = &skb->next;
+ head->data_len += skb->len;
+ head->len += skb->len;
+ head->truesize += skb->truesize;
+ }
+
+ if (!rxm->strp.full_len) {
+ ssize_t len;
+
+ len = (*strp->cb.parse_msg)(strp, head);
+
+ if (!len) {
+ /* Need more header to determine length */
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ strp_start_rx_timer(strp);
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ STRP_STATS_INCR(strp->stats.rx_need_more_hdr);
+ WARN_ON(eaten != orig_len);
+ break;
+ } else if (len < 0) {
+ if (len == -ESTRPIPE && rxm->accum_len) {
+ len = -ENODATA;
+ strp->rx_unrecov_intr = 1;
+ } else {
+ strp->rx_interrupted = 1;
+ }
+ strp_parser_err(strp, len, desc);
+ break;
+ } else if (len > strp->sk->sk_rcvbuf) {
+ /* Message length exceeds maximum allowed */
+ STRP_STATS_INCR(strp->stats.rx_msg_too_big);
+ strp_parser_err(strp, -EMSGSIZE, desc);
+ break;
+ } else if (len <= (ssize_t)head->len -
+ skb->len - rxm->strp.offset) {
+ /* Length must be into new skb (and also
+ * greater than zero)
+ */
+ STRP_STATS_INCR(strp->stats.rx_bad_hdr_len);
+ strp_parser_err(strp, -EPROTO, desc);
+ break;
+ }
+
+ rxm->strp.full_len = len;
+ }
+
+ extra = (ssize_t)(rxm->accum_len + cand_len) -
+ rxm->strp.full_len;
+
+ if (extra < 0) {
+ /* Message not complete yet. */
+ if (rxm->strp.full_len - rxm->accum_len >
+ strp_peek_len(strp)) {
+ /* Don't have the whole messages in the socket
+ * buffer. Set strp->rx_need_bytes to wait for
+ * the rest of the message. Also, set "early
+ * eaten" since we've already buffered the skb
+ * but don't consume yet per strp_read_sock.
+ */
+
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ strp_start_rx_timer(strp);
+ }
+
+ strp->rx_need_bytes = rxm->strp.full_len -
+ rxm->accum_len;
+ rxm->accum_len += cand_len;
+ rxm->early_eaten = cand_len;
+ STRP_STATS_ADD(strp->stats.rx_bytes, cand_len);
+ desc->count = 0; /* Stop reading socket */
+ break;
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ WARN_ON(eaten != orig_len);
+ break;
+ }
+
+ /* Positive extra indicates ore bytes than needed for the
+ * message
+ */
+
+ WARN_ON(extra > cand_len);
+
+ eaten += (cand_len - extra);
+
+ /* Hurray, we have a new message! */
+ del_timer(&strp->rx_msg_timer);
+ strp->rx_skb_head = NULL;
+ STRP_STATS_INCR(strp->stats.rx_msgs);
+
+ /* Give skb to upper layer */
+ strp->cb.rcv_msg(strp, head);
+
+ if (unlikely(strp->rx_paused)) {
+ /* Upper layer paused strp */
+ break;
+ }
+ }
+
+ if (cloned_orig)
+ kfree_skb(orig_skb);
+
+ STRP_STATS_ADD(strp->stats.rx_bytes, eaten);
+
+ return eaten;
+}
+
+static int default_read_sock_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+/* Called with lock held on lower socket */
+static int strp_read_sock(struct strparser *strp)
+{
+ struct socket *sock = strp->sk->sk_socket;
+ read_descriptor_t desc;
+
+ desc.arg.data = strp;
+ desc.error = 0;
+ desc.count = 1; /* give more than one skb per call */
+
+ /* sk should be locked here, so okay to do read_sock */
+ sock->ops->read_sock(strp->sk, &desc, strp_recv);
+
+ desc.error = strp->cb.read_sock_done(strp, desc.error);
+
+ return desc.error;
+}
+
+/* Lower sock lock held */
+void strp_data_ready(struct strparser *strp)
+{
+ if (unlikely(strp->rx_stopped))
+ return;
+
+ /* This check is needed to synchronize with do_strp_rx_work.
+ * do_strp_rx_work acquires a process lock (lock_sock) whereas
+ * the lock held here is bh_lock_sock. The two locks can be
+ * held by different threads at the same time, but bh_lock_sock
+ * allows a thread in BH context to safely check if the process
+ * lock is held. In this case, if the lock is held, queue work.
+ */
+ if (sock_owned_by_user(strp->sk)) {
+ queue_work(strp_wq, &strp->rx_work);
+ return;
+ }
+
+ if (strp->rx_paused)
+ return;
+
+ if (strp->rx_need_bytes) {
+ if (strp_peek_len(strp) >= strp->rx_need_bytes)
+ strp->rx_need_bytes = 0;
+ else
+ return;
+ }
+
+ if (strp_read_sock(strp) == -ENOMEM)
+ queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_data_ready);
+
+static void do_strp_rx_work(struct strparser *strp)
+{
+ read_descriptor_t rd_desc;
+ struct sock *csk = strp->sk;
+
+ /* We need the read lock to synchronize with strp_data_ready. We
+ * need the socket lock for calling strp_read_sock.
+ */
+ lock_sock(csk);
+
+ if (unlikely(strp->rx_stopped))
+ goto out;
+
+ if (strp->rx_paused)
+ goto out;
+
+ rd_desc.arg.data = strp;
+
+ if (strp_read_sock(strp) == -ENOMEM)
+ queue_work(strp_wq, &strp->rx_work);
+
+out:
+ release_sock(csk);
+}
+
+static void strp_rx_work(struct work_struct *w)
+{
+ do_strp_rx_work(container_of(w, struct strparser, rx_work));
+}
+
+static void strp_rx_msg_timeout(unsigned long arg)
+{
+ struct strparser *strp = (struct strparser *)arg;
+
+ /* Message assembly timed out */
+ STRP_STATS_INCR(strp->stats.rx_msg_timeouts);
+ lock_sock(strp->sk);
+ strp->cb.abort_parser(strp, ETIMEDOUT);
+ release_sock(strp->sk);
+}
+
+int strp_init(struct strparser *strp, struct sock *csk,
+ struct strp_callbacks *cb)
+{
+ struct socket *sock = csk->sk_socket;
+
+ if (!cb || !cb->rcv_msg || !cb->parse_msg)
+ return -EINVAL;
+
+ if (!sock->ops->read_sock || !sock->ops->peek_len)
+ return -EAFNOSUPPORT;
+
+ memset(strp, 0, sizeof(*strp));
+
+ strp->sk = csk;
+
+ setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout,
+ (unsigned long)strp);
+
+ INIT_WORK(&strp->rx_work, strp_rx_work);
+
+ strp->cb.rcv_msg = cb->rcv_msg;
+ strp->cb.parse_msg = cb->parse_msg;
+ strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
+ strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(strp_init);
+
+void strp_unpause(struct strparser *strp)
+{
+ strp->rx_paused = 0;
+
+ /* Sync setting rx_paused with RX work */
+ smp_mb();
+
+ queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_unpause);
+
+/* strp must already be stopped so that strp_recv will no longer be called.
+ * Note that strp_done is not called with the lower socket held.
+ */
+void strp_done(struct strparser *strp)
+{
+ WARN_ON(!strp->rx_stopped);
+
+ del_timer_sync(&strp->rx_msg_timer);
+ cancel_work_sync(&strp->rx_work);
+
+ if (strp->rx_skb_head) {
+ kfree_skb(strp->rx_skb_head);
+ strp->rx_skb_head = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(strp_done);
+
+void strp_stop(struct strparser *strp)
+{
+ strp->rx_stopped = 1;
+}
+EXPORT_SYMBOL_GPL(strp_stop);
+
+void strp_check_rcv(struct strparser *strp)
+{
+ queue_work(strp_wq, &strp->rx_work);
+}
+EXPORT_SYMBOL_GPL(strp_check_rcv);
+
+static int __init strp_mod_init(void)
+{
+ strp_wq = create_singlethread_workqueue("kstrp");
+
+ return 0;
+}
+
+static void __exit strp_mod_exit(void)
+{
+}
+module_init(strp_mod_init);
+module_exit(strp_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a7e42f9a405c..2bff63a73cf8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
*entry, *new;
unsigned int nr;
- nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+ nr = auth->au_ops->hash_cred(acred, cache->hashbits);
rcu_read_lock();
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 168219535a34..f1df9837f1ac 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
return auth->au_ops->lookup_cred(auth, acred, lookupflags);
}
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kgid(&init_user_ns, acred->gid) |
+ ((u64)from_kuid(&init_user_ns, acred->uid) <<
+ (sizeof(gid_t) * 8)), hashbits);
+}
+
/*
* Lookup generic creds for current process
*/
@@ -176,8 +184,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
goto out_nomatch;
for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
- if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
- GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(gcred->acred.group_info->gid[i],
+ acred->group_info->gid[i]))
goto out_nomatch;
}
out_match:
@@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
static const struct rpc_authops generic_auth_ops = {
.owner = THIS_MODULE,
.au_name = "Generic",
+ .hash_cred = generic_hash_cred,
.lookup_cred = generic_lookup_cred,
.crcreate = generic_create_cred,
.key_timeout = generic_key_timeout,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 976c7812bbd5..cdeb1d814833 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -50,7 +50,7 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/gss_api.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/hashtable.h>
#include "../netns.h"
@@ -541,9 +541,13 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
return gss_new;
gss_msg = gss_add_msg(gss_new);
if (gss_msg == gss_new) {
- int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
+ int res;
+ atomic_inc(&gss_msg->count);
+ res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
if (res) {
gss_unhash_msg(gss_new);
+ atomic_dec(&gss_msg->count);
+ gss_release_msg(gss_new);
gss_msg = ERR_PTR(res);
}
} else
@@ -836,6 +840,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
warn_gssd();
gss_release_msg(gss_msg);
}
+ gss_release_msg(gss_msg);
}
static void gss_pipe_dentry_destroy(struct dentry *dir,
@@ -1298,6 +1303,12 @@ gss_destroy_cred(struct rpc_cred *cred)
gss_destroy_nullcred(cred);
}
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+}
+
/*
* Lookup RPCSEC_GSS cred for the current process
*/
@@ -1610,7 +1621,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- __be32 seq;
+ __be32 *seq = NULL;
struct kvec iov;
struct xdr_buf verf_buf;
struct xdr_netobj mic;
@@ -1625,9 +1636,12 @@ gss_validate(struct rpc_task *task, __be32 *p)
goto out_bad;
if (flav != RPC_AUTH_GSS)
goto out_bad;
- seq = htonl(task->tk_rqstp->rq_seqno);
- iov.iov_base = &seq;
- iov.iov_len = sizeof(seq);
+ seq = kmalloc(4, GFP_NOFS);
+ if (!seq)
+ goto out_bad;
+ *seq = htonl(task->tk_rqstp->rq_seqno);
+ iov.iov_base = seq;
+ iov.iov_len = 4;
xdr_buf_from_iov(&iov, &verf_buf);
mic.data = (u8 *)p;
mic.len = len;
@@ -1647,11 +1661,13 @@ gss_validate(struct rpc_task *task, __be32 *p)
gss_put_ctx(ctx);
dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
task->tk_pid, __func__);
+ kfree(seq);
return p + XDR_QUADLEN(len);
out_bad:
gss_put_ctx(ctx);
dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
PTR_ERR(ret));
+ kfree(seq);
return ret;
}
@@ -1982,6 +1998,7 @@ static const struct rpc_authops authgss_ops = {
.au_name = "RPCSEC_GSS",
.create = gss_create,
.destroy = gss_destroy,
+ .hash_cred = gss_hash_cred,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
.list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 244245bcbbd2..fb39284ec174 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -166,8 +166,8 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
unsigned int usage, struct xdr_netobj *cksumout)
{
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
u8 rc4salt[4];
struct crypto_ahash *md5;
struct crypto_ahash *hmac_md5;
@@ -187,23 +187,22 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (!checksumdata)
+ return GSS_S_FAILURE;
+
md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(md5))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
CRYPTO_ALG_ASYNC);
- if (IS_ERR(hmac_md5)) {
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ if (IS_ERR(hmac_md5))
+ goto out_free_md5;
- req = ahash_request_alloc(md5, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(hmac_md5);
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(md5, GFP_NOFS);
+ if (!req)
+ goto out_free_hmac_md5;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -231,12 +230,9 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
goto out;
ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(hmac_md5);
- crypto_free_ahash(md5);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(hmac_md5, GFP_NOFS);
+ if (!req)
+ goto out_free_hmac_md5;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -258,8 +254,12 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
cksumout->len = kctx->gk5e->cksumlength;
out:
ahash_request_free(req);
- crypto_free_ahash(md5);
+out_free_hmac_md5:
crypto_free_ahash(hmac_md5);
+out_free_md5:
+ crypto_free_ahash(md5);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -276,8 +276,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
struct crypto_ahash *tfm;
struct ahash_request *req;
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
unsigned int checksumlen;
if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
@@ -291,15 +291,17 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (checksumdata == NULL)
+ return GSS_S_FAILURE;
+
tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
- req = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(tfm);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(tfm, GFP_NOFS);
+ if (!req)
+ goto out_free_ahash;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -349,7 +351,10 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
cksumout->len = kctx->gk5e->cksumlength;
out:
ahash_request_free(req);
+out_free_ahash:
crypto_free_ahash(tfm);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -368,8 +373,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
struct crypto_ahash *tfm;
struct ahash_request *req;
struct scatterlist sg[1];
- int err;
- u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ int err = -1;
+ u8 *checksumdata;
unsigned int checksumlen;
if (kctx->gk5e->keyed_cksum == 0) {
@@ -383,16 +388,18 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ if (!checksumdata)
+ return GSS_S_FAILURE;
+
tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
- return GSS_S_FAILURE;
+ goto out_free_cksum;
checksumlen = crypto_ahash_digestsize(tfm);
- req = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- crypto_free_ahash(tfm);
- return GSS_S_FAILURE;
- }
+ req = ahash_request_alloc(tfm, GFP_NOFS);
+ if (!req)
+ goto out_free_ahash;
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
@@ -433,7 +440,10 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
}
out:
ahash_request_free(req);
+out_free_ahash:
crypto_free_ahash(tfm);
+out_free_cksum:
+ kfree(checksumdata);
return err ? GSS_S_FAILURE : 0;
}
@@ -666,14 +676,17 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
u32 ret;
struct scatterlist sg[1];
SKCIPHER_REQUEST_ON_STACK(req, cipher);
- u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];
+ u8 *data;
struct page **save_pages;
u32 len = buf->len - offset;
- if (len > ARRAY_SIZE(data)) {
+ if (len > GSS_KRB5_MAX_BLOCKSIZE * 2) {
WARN_ON(0);
return -ENOMEM;
}
+ data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
/*
* For encryption, we want to read from the cleartext
@@ -708,6 +721,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
ret = write_bytes_to_xdr_buf(buf, offset, data, len);
out:
+ kfree(data);
return ret;
}
@@ -949,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
@@ -1016,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 60595835317a..7bb2514aadd9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
goto out_err_free_hmac;
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate hash descriptor for '%s'\n",
__func__, ctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index eeeba5adee6d..25d9a9cf7b66 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
kgid = make_kgid(&init_user_ns, tmp);
if (!gid_valid(kgid))
goto out_free_groups;
- GROUP_AT(creds->cr_group_info, i) = kgid;
+ creds->cr_group_info->gid[i] = kgid;
}
return 0;
@@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
if (!oa->data)
return -ENOMEM;
- creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
if (!creds) {
kfree(oa->data);
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index d8582028b346..153082598522 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd,
kgid = make_kgid(&init_user_ns, id);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
+ rsci.cred.cr_group_info->gid[i] = kgid;
}
/* mech name */
@@ -718,30 +718,37 @@ gss_write_null_verf(struct svc_rqst *rqstp)
static int
gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
{
- __be32 xdr_seq;
+ __be32 *xdr_seq;
u32 maj_stat;
struct xdr_buf verf_data;
struct xdr_netobj mic;
__be32 *p;
struct kvec iov;
+ int err = -1;
svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS);
- xdr_seq = htonl(seq);
+ xdr_seq = kmalloc(4, GFP_KERNEL);
+ if (!xdr_seq)
+ return -1;
+ *xdr_seq = htonl(seq);
- iov.iov_base = &xdr_seq;
- iov.iov_len = sizeof(xdr_seq);
+ iov.iov_base = xdr_seq;
+ iov.iov_len = 4;
xdr_buf_from_iov(&iov, &verf_data);
p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
mic.data = (u8 *)(p + 1);
maj_stat = gss_get_mic(ctx_id, &verf_data, &mic);
if (maj_stat != GSS_S_COMPLETE)
- return -1;
+ goto out;
*p++ = htonl(mic.len);
memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len);
p += XDR_QUADLEN(mic.len);
if (!xdr_ressize_check(rqstp, p))
- return -1;
- return 0;
+ goto out;
+ err = 0;
+out:
+ kfree(xdr_seq);
+ return err;
}
struct gss_domain {
@@ -1482,7 +1489,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
- rsci->h.expiry_time = get_seconds();
+ rsci->h.expiry_time = seconds_since_boot();
set_bit(CACHE_NEGATIVE, &rsci->h.flags);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
@@ -1541,7 +1548,7 @@ complete:
ret = SVC_COMPLETE;
goto out;
drop:
- ret = SVC_DROP;
+ ret = SVC_CLOSE;
out:
if (rsci)
cache_put(&rsci->h, sn->rsc_cache);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a99278c984e8..306fc0f54596 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth)
rpcauth_clear_credcache(auth->au_credcache);
}
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+ return hash_64(from_kgid(&init_user_ns, acred->gid) |
+ ((u64)from_kuid(&init_user_ns, acred->uid) <<
+ (sizeof(gid_t) * 8)), hashbits);
+}
+
/*
* Lookup AUTH_UNIX creds for current process
*/
@@ -79,7 +87,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
cred->uc_gid = acred->gid;
for (i = 0; i < groups; i++)
- cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+ cred->uc_gids[i] = acred->group_info->gid[i];
if (i < NFS_NGROUPS)
cred->uc_gids[i] = INVALID_GID;
@@ -127,7 +135,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
if (groups > NFS_NGROUPS)
groups = NFS_NGROUPS;
for (i = 0; i < groups ; i++)
- if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
return 0;
if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
@@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = {
.au_name = "UNIX",
.create = unx_create,
.destroy = unx_destroy,
+ .hash_cred = unx_hash_cred,
.lookup_cred = unx_lookup_cred,
.crcreate = unx_create_cred,
};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 229956bf8457..ac701c28f44f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
page = alloc_page(gfp_flags);
if (page == NULL)
return -ENOMEM;
- buf->head[0].iov_base = page_address(page);
- buf->head[0].iov_len = PAGE_SIZE;
- buf->tail[0].iov_base = NULL;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->len = 0;
- buf->buflen = PAGE_SIZE;
+ xdr_buf_init(buf, page_address(page), PAGE_SIZE);
return 0;
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4d8e11f94a35..f39e3e11f9aa 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string_helpers.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
@@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
spin_unlock(&cache_list_lock);
/* start the cleaning process */
- schedule_delayed_work(&cache_cleaner, 0);
+ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
}
EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
@@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work)
delay = 0;
if (delay)
- schedule_delayed_work(&cache_cleaner, delay);
+ queue_delayed_work(system_power_efficient_wq,
+ &cache_cleaner, delay);
}
@@ -1357,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p)
ifdebug(CACHE)
seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
- atomic_read(&cp->ref.refcount), cp->flags);
+ kref_read(&cp->ref), cp->flags);
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b376fa0..1dc9f3bac099 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
struct super_block *sb)
{
struct dentry *dentry;
- int err = 0;
switch (event) {
case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
return -ENOTSUPP;
}
- return err;
+ return 0;
}
static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -337,6 +336,11 @@ out:
static DEFINE_IDA(rpc_clids);
+void rpc_cleanup_clids(void)
+{
+ ida_destroy(&rpc_clids);
+}
+
static int rpc_alloc_clid(struct rpc_clnt *clnt)
{
int clid;
@@ -988,7 +992,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
{
if (clnt != NULL) {
- rpc_task_release_client(task);
if (task->tk_xprt == NULL)
task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
task->tk_client = clnt;
@@ -1693,6 +1696,7 @@ call_allocate(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+ int status;
dprint_status(task);
@@ -1718,11 +1722,14 @@ call_allocate(struct rpc_task *task)
req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
req->rq_rcvsize <<= 2;
- req->rq_buffer = xprt->ops->buf_alloc(task,
- req->rq_callsize + req->rq_rcvsize);
- if (req->rq_buffer != NULL)
- return;
+ status = xprt->ops->buf_alloc(task);
xprt_inject_disconnect(xprt);
+ if (status == 0)
+ return;
+ if (status != -ENOMEM) {
+ rpc_exit(task, status);
+ return;
+ }
dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
@@ -1748,18 +1755,6 @@ rpc_task_force_reencode(struct rpc_task *task)
task->tk_rqstp->rq_bytes_sent = 0;
}
-static inline void
-rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
-{
- buf->head[0].iov_base = start;
- buf->head[0].iov_len = len;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->flags = 0;
- buf->len = 0;
- buf->buflen = len;
-}
-
/*
* 3. Encode arguments of an RPC call
*/
@@ -1772,12 +1767,12 @@ rpc_xdr_encode(struct rpc_task *task)
dprint_status(task);
- rpc_xdr_buf_init(&req->rq_snd_buf,
- req->rq_buffer,
- req->rq_callsize);
- rpc_xdr_buf_init(&req->rq_rcv_buf,
- (char *)req->rq_buffer + req->rq_callsize,
- req->rq_rcvsize);
+ xdr_buf_init(&req->rq_snd_buf,
+ req->rq_buffer,
+ req->rq_callsize);
+ xdr_buf_init(&req->rq_rcv_buf,
+ req->rq_rbuffer,
+ req->rq_rcvsize);
p = rpc_encode_header(task);
if (p == NULL) {
@@ -1936,6 +1931,8 @@ call_connect_status(struct rpc_task *task)
case -EADDRINUSE:
case -ENOBUFS:
case -EPIPE:
+ xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
+ task->tk_rqstp->rq_connect_cookie);
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
@@ -2616,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
/**
+ * rpc_clnt_setup_test_and_add_xprt()
+ *
+ * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
+ * 1) caller of the test function must dereference the rpc_xprt_switch
+ * and the rpc_xprt.
+ * 2) test function must call rpc_xprt_switch_add_xprt, usually in
+ * the rpc_call_done routine.
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps: the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ * and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+ struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt,
+ void *data)
+{
+ struct rpc_cred *cred;
+ struct rpc_task *task;
+ struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+ int status = -EADDRINUSE;
+
+ xprt = xprt_get(xprt);
+ xprt_switch_get(xps);
+
+ if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ goto out_err;
+
+ /* Test the connection */
+ cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+ task = rpc_call_null_helper(clnt, xprt, cred,
+ RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+ NULL, NULL);
+ put_rpccred(cred);
+ if (IS_ERR(task)) {
+ status = PTR_ERR(task);
+ goto out_err;
+ }
+ status = task->tk_status;
+ rpc_put_task(task);
+
+ if (status < 0)
+ goto out_err;
+
+ /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
+ xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+ /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+ return 1;
+out_err:
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+ pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n",
+ status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
+/**
* rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
* @clnt: pointer to struct rpc_clnt
* @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2758,37 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
}
EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
+{
+ rcu_read_lock();
+ xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ rcu_read_lock();
+ rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+ xprt);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+ const struct sockaddr *sap)
+{
+ struct rpc_xprt_switch *xps;
+ bool ret;
+
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ ret = rpc_xprt_switch_has_addr(xps, sap);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static void rpc_show_header(void)
{
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index df5826876535..394ce523174c 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -34,7 +34,7 @@ struct sunrpc_net {
struct proc_dir_entry *use_gssp_proc;
};
-extern int sunrpc_net_id;
+extern unsigned int sunrpc_net_id;
int ip_map_cache_create(struct net *);
void ip_map_cache_destroy(struct net *);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 84f98cbe31c3..61a504fb1ae2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -477,7 +477,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9ae588511aaf..5db68b371db2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work)
}
/**
- * rpc_malloc - allocate an RPC buffer
- * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * rpc_malloc - allocate RPC buffer resources
+ * @task: RPC task
+ *
+ * A single memory region is allocated, which is split between the
+ * RPC call and RPC reply that this task is being used for. When
+ * this RPC is retired, the memory is released by calling rpc_free.
*
* To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL and suppressing warning if the request cannot be serviced
- * immediately.
- * The caller can arrange to sleep in a way that is safe for rpciod.
+ * returning -ENOMEM and suppressing warning if the request cannot
+ * be serviced immediately. The caller can arrange to sleep in a
+ * way that is safe for rpciod.
*
* Most requests are 'small' (under 2KiB) and can be serviced from a
* mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work)
* In order to avoid memory starvation triggering more writebacks of
* NFS requests, we avoid using GFP_KERNEL.
*/
-void *rpc_malloc(struct rpc_task *task, size_t size)
+int rpc_malloc(struct rpc_task *task)
{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
struct rpc_buffer *buf;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
@@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
buf = kmalloc(size, gfp);
if (!buf)
- return NULL;
+ return -ENOMEM;
buf->len = size;
dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
task->tk_pid, size, buf);
- return &buf->data;
+ rqst->rq_buffer = buf->data;
+ rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+ return 0;
}
EXPORT_SYMBOL_GPL(rpc_malloc);
/**
- * rpc_free - free buffer allocated via rpc_malloc
- * @buffer: buffer to free
+ * rpc_free - free RPC buffer resources allocated via rpc_malloc
+ * @task: RPC task
*
*/
-void rpc_free(void *buffer)
+void rpc_free(struct rpc_task *task)
{
+ void *buffer = task->tk_rqstp->rq_buffer;
size_t size;
struct rpc_buffer *buf;
- if (!buffer)
- return;
-
buf = container_of(buffer, struct rpc_buffer, data);
size = buf->len;
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2ecb994314c1..caeb01ad2b5a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
spin_lock(&op_metrics->om_lock);
op_metrics->om_ops++;
- op_metrics->om_ntrans += req->rq_ntrans;
+ /* kernel API: om_ops must never become larger than om_ntrans */
+ op_metrics->om_ntrans += max(req->rq_ntrans, 1);
op_metrics->om_timeouts += task->tk_timeouts;
op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
- delta = ktime_sub(req->rq_xtime, task->tk_start);
- op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
-
+ if (ktime_to_ns(req->rq_xtime)) {
+ delta = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+ }
op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
delta = ktime_sub(now, task->tk_start);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index ee5d3d253102..c73de181467a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,7 +24,7 @@
#include "netns.h"
-int sunrpc_net_id;
+unsigned int sunrpc_net_id;
EXPORT_SYMBOL_GPL(sunrpc_net_id);
static __net_init int sunrpc_init_net(struct net *net)
@@ -119,6 +119,7 @@ out:
static void __exit
cleanup_sunrpc(void)
{
+ rpc_cleanup_clids();
rpcauth_remove_module();
cleanup_socket_xprt();
svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..75f290bddca1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
}
EXPORT_SYMBOL_GPL(svc_bind);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+ INIT_LIST_HEAD(&serv->sv_cb_list);
+ spin_lock_init(&serv->sv_cb_lock);
+ init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
/*
* Create an RPC service
*/
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
init_timer(&serv->sv_temptimer);
spin_lock_init(&serv->sv_lock);
+ __svc_init_bc(serv);
+
serv->sv_nrpools = npools;
serv->sv_pools =
kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
@@ -1138,8 +1155,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
- svc_close_xprt(rqstp->rq_xprt);
+ goto close;
case SVC_DROP:
goto dropit;
case SVC_COMPLETE:
@@ -1229,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
sendit:
if (svc_authorise(rqstp))
- goto dropit;
+ goto close;
return 1; /* Caller can now send it */
dropit:
@@ -1237,11 +1253,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
dprintk("svc: svc_process dropit\n");
return 0;
+ close:
+ if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ svc_close_xprt(rqstp->rq_xprt);
+ dprintk("svc: svc_process close\n");
+ return 0;
+
err_short_len:
svc_printk(rqstp, "short len %Zd, dropping request\n",
argv->iov_len);
-
- goto dropit; /* drop request */
+ goto close;
err_bad_rpc:
serv->sv_stats->rpcbadfmt++;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3f652395a80..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
svc_xprt_get(xprt);
dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, atomic_read(&xprt->xpt_ref.refcount));
+ xprt, kref_read(&xprt->xpt_ref));
}
spin_unlock_bh(&pool->sp_lock);
out:
@@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
dprintk("svc_recv: found XPT_CLOSE\n");
+ if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
+ xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_delete_xprt(xprt);
/* Leave XPT_BUSY set on the dead xprt: */
goto out;
@@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
rqstp->rq_deferred = svc_deferred_dequeue(xprt);
if (rqstp->rq_deferred)
len = svc_deferred_recv(rqstp);
@@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
* through, close it. */
if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
continue;
- if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+ if (kref_read(&xprt->xpt_ref) > 1 ||
test_bit(XPT_BUSY, &xprt->xpt_flags))
continue;
list_del_init(le);
@@ -1002,14 +1004,8 @@ static void svc_age_temp_xprts(unsigned long closure)
void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
{
struct svc_xprt *xprt;
- struct svc_sock *svsk;
- struct socket *sock;
struct list_head *le, *next;
LIST_HEAD(to_be_closed);
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
spin_lock_bh(&serv->sv_lock);
list_for_each_safe(le, next, &serv->sv_tempsocks) {
@@ -1026,12 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
le = to_be_closed.next;
list_del_init(le);
xprt = list_entry(le, struct svc_xprt, xpt_list);
- dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
- svsk = container_of(xprt, struct svc_sock, sk_xprt);
- sock = svsk->sk_sock;
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
- svc_close_xprt(xprt);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ set_bit(XPT_KILL_TEMP, &xprt->xpt_flags);
+ dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n",
+ xprt);
+ svc_xprt_enqueue(xprt);
}
}
EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 69841db1f533..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -124,16 +124,20 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
#define DN_HASHMAX (1<<DN_HASHBITS)
static struct hlist_head auth_domain_table[DN_HASHMAX];
-static spinlock_t auth_domain_lock =
- __SPIN_LOCK_UNLOCKED(auth_domain_lock);
+static DEFINE_SPINLOCK(auth_domain_lock);
+
+static void auth_domain_release(struct kref *kref)
+{
+ struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+ hlist_del(&dom->hash);
+ dom->flavour->domain_release(dom);
+ spin_unlock(&auth_domain_lock);
+}
void auth_domain_put(struct auth_domain *dom)
{
- if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
- hlist_del(&dom->hash);
- dom->flavour->domain_release(dom);
- spin_unlock(&auth_domain_lock);
- }
+ kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
}
EXPORT_SYMBOL_GPL(auth_domain_put);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index dfacdc95b3f5..64af4f034de6 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd,
kgid = make_kgid(&init_user_ns, gid);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(ug.gi, i) = kgid;
+ ug.gi->gid[i] = kgid;
}
ugp = unix_gid_lookup(cd, uid);
@@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m,
seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
for (i = 0; i < glen; i++)
- seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
+ seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i]));
seq_printf(m, "\n");
return 0;
}
@@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
return SVC_CLOSE;
for (i = 0; i < slen; i++) {
kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
- GROUP_AT(cred->cr_group_info, i) = kgid;
+ cred->cr_group_info->gid[i] = kgid;
}
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
*authp = rpc_autherr_badverf;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 57625f64efd5..de066acdb34e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -39,9 +39,10 @@
#include <net/checksum.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/udp.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <trace/events/skb.h>
@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp)
}
}
+static void svc_release_udp_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+
+ if (skb) {
+ rqstp->rq_xprt_ctxt = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ consume_skb(skb);
+ }
+}
+
union svc_pktinfo_u {
struct in_pktinfo pkti;
struct in6_pktinfo pkti6;
@@ -438,6 +451,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
+static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ struct linger no_linger = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ sock = svsk->sk_sock;
+ kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&no_linger, sizeof(no_linger));
+}
+
/*
* See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
*/
@@ -534,7 +562,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
if (err >= 0)
- skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+ skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
if (skb == NULL) {
if (err != -EAGAIN) {
@@ -546,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
}
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
- if (skb->tstamp.tv64 == 0) {
+ if (skb->tstamp == 0) {
skb->tstamp = ktime_get_real();
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
@@ -575,7 +603,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
goto out_free;
}
local_bh_enable();
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ consume_skb(skb);
} else {
/* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data;
@@ -602,8 +630,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
return len;
out_free:
- trace_kfree_skb(skb, svc_udp_recvfrom);
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ kfree_skb(skb);
return 0;
}
@@ -648,6 +675,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
return NULL;
}
+static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -660,13 +691,14 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_release_rqst = svc_release_skb,
+ .xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt,
};
static struct svc_xprt_class svc_udp_class = {
@@ -1242,6 +1274,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt,
};
static struct svc_xprt_class svc_tcp_class = {
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index c88d9bc06f5c..8c3936403fea 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -14,7 +14,7 @@
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/stats.h>
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c4f3cc0c0775..7f1071e103ca 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase -= xdr->buf->page_base;
if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
xdr_set_next_page(xdr);
else if (xdr->iov == xdr->buf->head) {
if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
}
return xdr->p != xdr->end;
}
@@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
- void *cpdest = xdr->scratch.iov_base;
+ char *cpdest = xdr->scratch.iov_base;
size_t cplen = (char *)xdr->end - (char *)xdr->p;
if (nbytes > xdr->scratch.iov_len)
return NULL;
- memcpy(cpdest, xdr->p, cplen);
+ p = __xdr_inline_decode(xdr, cplen);
+ if (p == NULL)
+ return NULL;
+ memcpy(cpdest, p, cplen);
cpdest += cplen;
nbytes -= cplen;
if (!xdr_set_next_buffer(xdr))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ea244b29138b..9a6be030ca7d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -669,7 +669,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
spin_lock_bh(&xprt->transport_lock);
if (cookie != xprt->connect_cookie)
goto out;
- if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt))
+ if (test_bit(XPRT_CLOSING, &xprt->state))
goto out;
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
@@ -772,6 +772,7 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_connected(xprt)) {
task->tk_rqstp->rq_bytes_sent = 0;
task->tk_timeout = task->tk_rqstp->rq_timeout;
+ task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -1295,7 +1296,7 @@ void xprt_release(struct rpc_task *task)
xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
- xprt->ops->buf_free(req->rq_buffer);
+ xprt->ops->buf_free(task);
xprt_inject_disconnect(xprt);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 66c9d63f4797..ae92a9e9ba52 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -15,6 +15,7 @@
#include <asm/cmpxchg.h>
#include <linux/spinlock.h>
#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtmultipath.h>
typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+ if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
+ !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
@@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
}
+bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+ const struct sockaddr *sap)
+{
+ struct list_head *head;
+ struct rpc_xprt *pos;
+
+ if (xps == NULL || sap == NULL)
+ return false;
+
+ head = &xps->xps_xprt_list;
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
+ pr_info("RPC: addr %s already in xprt switch\n",
+ pos->address_strings[RPC_DISPLAY_ADDR]);
+ return true;
+ }
+ }
+ return false;
+}
+
static
struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
const struct rpc_xprt *cur)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 87762d976b63..24fedd4b117e 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
list_del(&req->rl_all);
spin_unlock(&buf->rb_reqslock);
- rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+ rpcrdma_destroy_req(req);
kfree(rqst);
}
@@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
- struct xdr_buf *buf;
size_t size;
req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,20 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
return PTR_ERR(req);
req->rl_backchannel = true;
- size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
- rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+ DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
goto out_fail;
req->rl_rdmabuf = rb;
- size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
- rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ size = r_xprt->rx_data.inline_rsize;
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
goto out_fail;
- rb->rg_owner = req;
req->rl_sendbuf = rb;
- /* so that rpcr_to_rdmar works when receiving a request */
- rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
-
- buf = &rqst->rq_snd_buf;
- buf->head[0].iov_base = rqst->rq_buffer;
- buf->head[0].iov_len = 0;
- buf->tail[0].iov_base = NULL;
- buf->tail[0].iov_len = 0;
- buf->page_len = 0;
- buf->len = 0;
- buf->buflen = size;
-
+ xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
+ min_t(size_t, size, PAGE_SIZE));
+ rpcrdma_set_xprtdata(rqst, req);
return 0;
out_fail:
@@ -204,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
size_t maxmsg;
maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -219,7 +208,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_msg *headerp;
- size_t rpclen;
headerp = rdmab_to_msg(req->rl_rdmabuf);
headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +219,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
headerp->rm_body.rm_chunks[1] = xdr_zero;
headerp->rm_body.rm_chunks[2] = xdr_zero;
- rpclen = rqst->rq_svec[0].iov_len;
-
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
- pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
- __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
- pr_info("RPC: %s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
- pr_info("RPC: %s: RPC: %*ph\n",
- __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
-#endif
-
- req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
- req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
- req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
- req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
- req->rl_send_iov[1].length = rpclen;
- req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 2;
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
+ &rqst->rq_snd_buf, rpcrdma_noch))
+ return -EIO;
return 0;
}
@@ -402,7 +373,7 @@ out_overflow:
out_short:
pr_warn("RPC/RDMA short backward direction call\n");
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
xprt_disconnect_done(xprt);
else
pr_warn("RPC: %s: reposting rep %p\n",
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 21cb3b150b37..1ebb09e1ac4f 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -160,9 +160,8 @@ static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
- rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
- RPCRDMA_MAX_DATA_SEGS /
- RPCRDMA_MAX_FMR_SGES));
+ ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES);
return 0;
}
@@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
list_for_each_entry(mw, &req->rl_registered, mw_list)
list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+ r_xprt->rx_stats.local_inv_needed++;
rc = ib_unmap_fmr(&unmap_list);
if (rc)
goto out_reset;
@@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_init_mr = fmr_op_init_mr,
.ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
+ .ro_send_w_inv_ok = 0,
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 892b5e1d9b09..47bed5333c7f 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -44,18 +44,20 @@
* being done.
*
* When the underlying transport disconnects, MRs are left in one of
- * three states:
+ * four states:
*
* INVALID: The MR was not in use before the QP entered ERROR state.
- * (Or, the LOCAL_INV WR has not completed or flushed yet).
- *
- * STALE: The MR was being registered or unregistered when the QP
- * entered ERROR state, and the pending WR was flushed.
*
* VALID: The MR was registered before the QP entered ERROR state.
*
- * When frwr_op_map encounters STALE and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Beause MR recovery
+ * FLUSHED_FR: The MR was being registered when the QP entered ERROR
+ * state, and the pending WR was flushed.
+ *
+ * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR
+ * state, and the pending WR was flushed.
+ *
+ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
* allocates fresh resources, it is deferred to a workqueue, and the
* recovered MRs are placed back on the rb_mws list when recovery is
* complete. frwr_op_map allocates another MR for the current RPC while
@@ -67,6 +69,8 @@
* pending send queue WRs before the transport is reconnected.
*/
+#include <linux/sunrpc/rpc_rdma.h>
+
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -97,7 +101,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
struct rpcrdma_frmr *f = &r->frmr;
int rc;
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
@@ -153,7 +157,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return rc;
}
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
ia->ri_max_frmr_depth);
if (IS_ERR(f->fr_mr)) {
pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
@@ -161,26 +165,25 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return PTR_ERR(f->fr_mr);
}
- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, f);
f->fr_state = FRMR_IS_INVALID;
return 0;
}
/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
- *
- * There's no recovery if this fails. The FRMR is abandoned, but
- * remains in rb_all. It will be cleaned up when the transport is
- * destroyed.
*/
static void
frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
+ enum rpcrdma_frmr_state state = mw->frmr.fr_state;
struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (state != FRMR_FLUSHED_LI)
+ ib_dma_unmap_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
if (rc)
goto out_release;
@@ -203,11 +206,16 @@ static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
int depth, delta;
+ ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- ia->ri_device->attrs.max_fast_reg_page_list_len);
+ attrs->max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -234,17 +242,16 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
- cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) {
+ cdata->max_requests = attrs->max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
depth;
}
- rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
- RPCRDMA_MAX_DATA_SEGS /
- ia->ri_max_frmr_depth));
+ ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth);
return 0;
}
@@ -261,10 +268,8 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
}
static void
-__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr,
- const char *wr)
+__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
{
- frmr->fr_state = FRMR_IS_STALE;
if (wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("rpcrdma: %s: %s (%u/0x%x)\n",
wr, ib_wc_status_msg(wc->status),
@@ -287,7 +292,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS) {
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- __frwr_sendcompletion_flush(wc, frmr, "fastreg");
+ frmr->fr_state = FRMR_FLUSHED_FR;
+ __frwr_sendcompletion_flush(wc, "fastreg");
}
}
@@ -307,7 +313,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS) {
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- __frwr_sendcompletion_flush(wc, frmr, "localinv");
+ frmr->fr_state = FRMR_FLUSHED_LI;
+ __frwr_sendcompletion_flush(wc, "localinv");
}
}
@@ -327,9 +334,11 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
cqe = wc->wr_cqe;
frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
- if (wc->status != IB_WC_SUCCESS)
- __frwr_sendcompletion_flush(wc, frmr, "localinv");
- complete_all(&frmr->fr_linv_done);
+ if (wc->status != IB_WC_SUCCESS) {
+ frmr->fr_state = FRMR_FLUSHED_LI;
+ __frwr_sendcompletion_flush(wc, "localinv");
+ }
+ complete(&frmr->fr_linv_done);
}
/* Post a REG_MR Work Request to register a memory region
@@ -340,6 +349,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -375,8 +385,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
++seg;
++i;
-
- /* Check for holes */
+ if (holes_ok)
+ continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
@@ -396,7 +406,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
goto out_mapmr_err;
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, mw->mw_nents, mr->length);
+ __func__, frmr, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -413,7 +423,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr);
rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
if (rc)
goto out_senderr;
@@ -443,24 +453,6 @@ out_senderr:
return -ENOTCONN;
}
-static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
-{
- struct rpcrdma_frmr *f = &mw->frmr;
- struct ib_send_wr *invalidate_wr;
-
- f->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &f->fr_invwr;
-
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- f->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &f->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
-
- return invalidate_wr;
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -471,11 +463,12 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+ struct ib_send_wr *first, **prev, *last, *bad_wr;
+ struct rpcrdma_rep *rep = req->rl_reply;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- int rc;
+ int count, rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -485,32 +478,53 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* a single ib_post_send() call.
*/
f = NULL;
- invalidate_wrs = pos = prev = NULL;
+ count = 0;
+ prev = &first;
list_for_each_entry(mw, &req->rl_registered, mw_list) {
- pos = __frwr_prepare_linv_wr(mw);
+ mw->frmr.fr_state = FRMR_IS_INVALID;
+
+ if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
+ (mw->mw_handle == rep->rr_inv_rkey))
+ continue;
- if (!invalidate_wrs)
- invalidate_wrs = pos;
- else
- prev->next = pos;
- prev = pos;
f = &mw->frmr;
+ dprintk("RPC: %s: invalidating frmr %p\n",
+ __func__, f);
+
+ f->fr_cqe.done = frwr_wc_localinv;
+ last = &f->fr_invwr;
+ memset(last, 0, sizeof(*last));
+ last->wr_cqe = &f->fr_cqe;
+ last->opcode = IB_WR_LOCAL_INV;
+ last->ex.invalidate_rkey = mw->mw_handle;
+ count++;
+
+ *prev = last;
+ prev = &last->next;
}
+ if (!f)
+ goto unmap;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+ last->send_flags = IB_SEND_SIGNALED;
f->fr_cqe.done = frwr_wc_localinv_wake;
reinit_completion(&f->fr_linv_done);
- INIT_CQCOUNT(&r_xprt->rx_ep);
+
+ /* Initialize CQ count, since there is always a signaled
+ * WR being posted here. The new cqcount depends on how
+ * many SQEs are about to be consumed.
+ */
+ rpcrdma_init_cqcount(&r_xprt->rx_ep, count);
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
* unless ri_id->qp is a valid pointer.
*/
- rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+ r_xprt->rx_stats.local_inv_needed++;
+ rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
if (rc)
goto reset_mrs;
@@ -521,6 +535,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
unmap:
list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ dprintk("RPC: %s: DMA unmapping frmr %p\n",
+ __func__, &mw->frmr);
list_del_init(&mw->mw_list);
ib_dma_unmap_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -537,7 +553,7 @@ reset_mrs:
*/
list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
- if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ if (mw->mw_handle == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
@@ -576,4 +592,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_init_mr = frwr_op_init_mr,
.ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr",
+ .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK,
};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a47f170b20ef..c52e0f2ffe52 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-enum rpcrdma_chunktype {
- rpcrdma_noch = 0,
- rpcrdma_readch,
- rpcrdma_areadch,
- rpcrdma_writech,
- rpcrdma_replych
-};
-
static const char transfertypes[][12] = {
"inline", /* no chunks */
"read list", /* some argument via rdma read */
@@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
return size;
}
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
- struct rpcrdma_create_data_internal *cdata,
- unsigned int maxsegs)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ unsigned int maxsegs = ia->ri_max_segs;
+
ia->ri_max_inline_write = cdata->inline_wsize -
rpcrdma_max_call_header_size(maxsegs);
ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
-static int
-rpcrdma_tail_pullup(struct xdr_buf *buf)
-{
- size_t tlen = buf->tail[0].iov_len;
- size_t skip = tlen & 3;
-
- /* Do not include the tail if it is only an XDR pad */
- if (tlen < 4)
- return 0;
-
- /* xdr_write_pages() adds a pad at the beginning of the tail
- * if the content in "buf->pages" is unaligned. Force the
- * tail's actual content to land at the next XDR position
- * after the head instead.
- */
- if (skip) {
- unsigned char *src, *dst;
- unsigned int count;
-
- src = buf->tail[0].iov_base;
- dst = buf->head[0].iov_base;
- dst += buf->head[0].iov_len;
-
- src += skip;
- tlen -= skip;
-
- dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
- __func__, skip, dst, src, tlen);
-
- for (count = tlen; count; count--)
- *dst++ = *src++;
- }
-
- return tlen;
-}
-
/* Split "vec" on page boundaries into segments. FMR registers pages,
* not a byte range. Other modes coalesce these segments into a single
* MR when they can.
@@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
+ bool reminv_expected)
{
int len, n, p, page_base;
struct page **ppages;
@@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (type == rpcrdma_readch)
return n;
+ /* When encoding the Write list, some servers need to see an extra
+ * segment for odd-length Write chunks. The upper layer provides
+ * space in the tail iovec for this purpose.
+ */
+ if (type == rpcrdma_writech && reminv_expected)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
if (rtype == rpcrdma_areadch)
pos = 0;
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg);
+ wtype, seg,
+ r_xprt->rx_ia.ri_reminv_expected);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ r_xprt->rx_ia.ri_reminv_expected);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr;
}
-/*
- * Copy write data inline.
- * This function is used for "small" requests. Data which is passed
- * to RPC via iovecs (or page list) is copied directly into the
- * pre-registered memory buffer for this request. For small amounts
- * of data, this is efficient. The cutoff value is tunable.
+/* Prepare the RPC-over-RDMA header SGE.
+ */
+static bool
+rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ u32 len)
+{
+ struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+ struct ib_sge *sge = &req->rl_send_sge[0];
+
+ if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
+ if (!__rpcrdma_dma_map_regbuf(ia, rb))
+ return false;
+ sge->addr = rdmab_addr(rb);
+ sge->lkey = rdmab_lkey(rb);
+ }
+ sge->length = len;
+
+ ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+ sge->length, DMA_TO_DEVICE);
+ req->rl_send_wr.num_sge++;
+ return true;
+}
+
+/* Prepare the Send SGEs. The head and tail iovec, and each entry
+ * in the page list, gets its own SGE.
*/
-static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
+static bool
+rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
- int i, npages, curlen;
- int copy_len;
- unsigned char *srcp, *destp;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int page_base;
- struct page **ppages;
+ unsigned int sge_no, page_base, len, remaining;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ struct ib_device *device = ia->ri_device;
+ struct ib_sge *sge = req->rl_send_sge;
+ u32 lkey = ia->ri_pd->local_dma_lkey;
+ struct page *page, **ppages;
+
+ /* The head iovec is straightforward, as it is already
+ * DMA-mapped. Sync the content that has changed.
+ */
+ if (!rpcrdma_dma_map_regbuf(ia, rb))
+ return false;
+ sge_no = 1;
+ sge[sge_no].addr = rdmab_addr(rb);
+ sge[sge_no].length = xdr->head[0].iov_len;
+ sge[sge_no].lkey = rdmab_lkey(rb);
+ ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+ sge[sge_no].length, DMA_TO_DEVICE);
+
+ /* If there is a Read chunk, the page list is being handled
+ * via explicit RDMA, and thus is skipped here. However, the
+ * tail iovec may include an XDR pad for the page list, as
+ * well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+ if (rtype == rpcrdma_readch) {
+ len = xdr->tail[0].iov_len;
- destp = rqst->rq_svec[0].iov_base;
- curlen = rqst->rq_svec[0].iov_len;
- destp += curlen;
+ /* Do not include the tail if it is only an XDR pad */
+ if (len < 4)
+ goto out;
- dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
- __func__, destp, rqst->rq_slen, curlen);
+ page = virt_to_page(xdr->tail[0].iov_base);
+ page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
- copy_len = rqst->rq_snd_buf.page_len;
+ /* If the content in the page list is an odd length,
+ * xdr_write_pages() has added a pad at the beginning
+ * of the tail iovec. Force the tail's non-pad content
+ * to land at the next XDR position in the Send message.
+ */
+ page_base += len & 3;
+ len -= len & 3;
+ goto map_tail;
+ }
- if (rqst->rq_snd_buf.tail[0].iov_len) {
- curlen = rqst->rq_snd_buf.tail[0].iov_len;
- if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
- memmove(destp + copy_len,
- rqst->rq_snd_buf.tail[0].iov_base, curlen);
- r_xprt->rx_stats.pullup_copy_count += curlen;
+ /* If there is a page list present, temporarily DMA map
+ * and prepare an SGE for each page to be sent.
+ */
+ if (xdr->page_len) {
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge_no++;
+ if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
+ goto out_mapping_overflow;
+
+ len = min_t(u32, PAGE_SIZE - page_base, remaining);
+ sge[sge_no].addr = ib_dma_map_page(device, *ppages,
+ page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ goto out_mapping_err;
+ sge[sge_no].length = len;
+ sge[sge_no].lkey = lkey;
+
+ req->rl_mapped_sges++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
}
- dprintk("RPC: %s: tail destp 0x%p len %d\n",
- __func__, destp + copy_len, curlen);
- rqst->rq_svec[0].iov_len += curlen;
}
- r_xprt->rx_stats.pullup_copy_count += copy_len;
- page_base = rqst->rq_snd_buf.page_base;
- ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
- page_base &= ~PAGE_MASK;
- npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
- for (i = 0; copy_len && i < npages; i++) {
- curlen = PAGE_SIZE - page_base;
- if (curlen > copy_len)
- curlen = copy_len;
- dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
- __func__, i, destp, copy_len, curlen);
- srcp = kmap_atomic(ppages[i]);
- memcpy(destp, srcp+page_base, curlen);
- kunmap_atomic(srcp);
- rqst->rq_svec[0].iov_len += curlen;
- destp += curlen;
- copy_len -= curlen;
- page_base = 0;
+ /* The tail iovec is not always constructed in the same
+ * page where the head iovec resides (see, for example,
+ * gss_wrap_req_priv). To neatly accommodate that case,
+ * DMA map it separately.
+ */
+ if (xdr->tail[0].iov_len) {
+ page = virt_to_page(xdr->tail[0].iov_base);
+ page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+ len = xdr->tail[0].iov_len;
+
+map_tail:
+ sge_no++;
+ sge[sge_no].addr = ib_dma_map_page(device, page,
+ page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ goto out_mapping_err;
+ sge[sge_no].length = len;
+ sge[sge_no].lkey = lkey;
+ req->rl_mapped_sges++;
}
- /* header now contains entire send message */
+
+out:
+ req->rl_send_wr.num_sge = sge_no + 1;
+ return true;
+
+out_mapping_overflow:
+ pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
+ return false;
+
+out_mapping_err:
+ pr_err("rpcrdma: Send mapping error\n");
+ return false;
+}
+
+bool
+rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+ u32 hdrlen, struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
+{
+ req->rl_send_wr.num_sge = 0;
+ req->rl_mapped_sges = 0;
+
+ if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
+ goto out_map;
+
+ if (rtype != rpcrdma_areadch)
+ if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
+ goto out_map;
+
+ return true;
+
+out_map:
+ pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+ return false;
+}
+
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_device *device = ia->ri_device;
+ struct ib_sge *sge;
+ int count;
+
+ sge = &req->rl_send_sge[2];
+ for (count = req->rl_mapped_sges; count--; sge++)
+ ib_dma_unmap_page(device, sge->addr, sge->length,
+ DMA_TO_DEVICE);
+ req->rl_mapped_sges = 0;
}
/*
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Prepares up to two IOVs per Call message:
- *
- * [0] -- RPC RDMA header
- * [1] -- the RPC header/data
- *
* Returns zero on success, otherwise a negative errno.
*/
@@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
- rpcrdma_inline_pullup(rqst);
- rpclen = rqst->rq_svec[0].iov_len;
+ rpclen = rqst->rq_snd_buf.len;
} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
- rpclen = rqst->rq_svec[0].iov_len;
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
+ rpclen = rqst->rq_snd_buf.head[0].iov_len +
+ rqst->rq_snd_buf.tail[0].iov_len;
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
goto out_unmap;
hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
- if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
- goto out_overflow;
-
dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
rqst->rq_task->tk_pid, __func__,
transfertypes[rtype], transfertypes[wtype],
hdrlen, rpclen);
- req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
- req->rl_send_iov[0].length = hdrlen;
- req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
- req->rl_niovs = 1;
- if (rtype == rpcrdma_areadch)
- return 0;
-
- req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
- req->rl_send_iov[1].length = rpclen;
- req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 2;
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+ &rqst->rq_snd_buf, rtype)) {
+ iptr = ERR_PTR(-EIO);
+ goto out_unmap;
+ }
return 0;
-out_overflow:
- pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
- hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
- iptr = ERR_PTR(-EIO);
-
out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
return PTR_ERR(iptr);
@@ -725,7 +786,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
ifdebug(FACILITY) {
u64 off;
xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
+ dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
__func__,
be32_to_cpu(seg->rs_length),
(unsigned long long)off,
@@ -845,28 +906,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
return fixup_copy_count;
}
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
- struct rpcrdma_ep *ep =
- container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
- struct rpcrdma_xprt *r_xprt =
- container_of(ep, struct rpcrdma_xprt, rx_ep);
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
- spin_lock_bh(&xprt->transport_lock);
- if (++xprt->connect_cookie == 0) /* maintain a reserved value */
- ++xprt->connect_cookie;
- if (ep->rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt))
- xprt_wake_pending_tasks(xprt, 0);
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
- }
- spin_unlock_bh(&xprt->transport_lock);
-}
-
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
@@ -898,26 +937,16 @@ rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
- schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
*/
void
-rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+rpcrdma_reply_handler(struct work_struct *work)
{
+ struct rpcrdma_rep *rep =
+ container_of(work, struct rpcrdma_rep, rr_work);
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
@@ -1132,6 +1161,6 @@ out_duplicate:
repost:
r_xprt->rx_stats.bad_reply_count++;
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
rpcrdma_recv_buffer_put(rep);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a2a7519b0f23..288e35c2d8f4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
ret = -EIO;
goto out_unmap;
}
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
memset(&send_wr, 0, sizeof(send_wr));
ctxt->cqe.done = svc_rdma_wc_send;
@@ -159,35 +159,40 @@ out_unmap:
/* Server-side transport endpoint wants a whole page for its send
* buffer. The client RPC code constructs the RPC header in this
* buffer before it invokes ->send_request.
- *
- * Returns NULL if there was a temporary allocation failure.
*/
-static void *
-xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
- struct svcxprt_rdma *rdma;
+ size_t size = rqst->rq_callsize;
struct page *page;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
-
- /* Prevent an infinite loop: try to make this case work */
- if (size > PAGE_SIZE)
+ if (size > PAGE_SIZE) {
WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
size);
+ return -EINVAL;
+ }
+ /* svc_rdma_sendto releases this page */
page = alloc_page(RPCRDMA_DEF_GFP);
if (!page)
- return NULL;
+ return -ENOMEM;
+ rqst->rq_buffer = page_address(page);
- return page_address(page);
+ rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP);
+ if (!rqst->rq_rbuffer) {
+ put_page(page);
+ return -ENOMEM;
+ }
+ return 0;
}
static void
-xprt_rdma_bc_free(void *buffer)
+xprt_rdma_bc_free(struct rpc_task *task)
{
- /* No-op: ctxt and page have already been freed. */
+ struct rpc_rqst *rqst = task->tk_rqstp;
+
+ kfree(rqst->rq_rbuffer);
}
static int
@@ -350,6 +355,7 @@ xprt_setup_rdma_bc(struct xprt_create *args)
out_fail:
xprt_rdma_free_addresses(xprt);
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
xprt_free(xprt);
return ERR_PTR(-EINVAL);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c25606f2561..172b537f8cfc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
ctxt->sge[pno].addr);
if (ret)
goto err;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[pno].length = len;
@@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
frmr->sg);
return -ENOMEM;
}
- atomic_inc(&xprt->sc_dma_used);
n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
@@ -348,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- ib_dma_unmap_sg(xprt->sc_cm_id->device,
- frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -374,9 +371,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
u32 position, u32 byte_count, u32 page_offset, int page_no)
{
char *srcp, *destp;
- int ret;
- ret = 0;
srcp = head->arg.head[0].iov_base + position;
byte_count = head->arg.head[0].iov_len - position;
if (byte_count > PAGE_SIZE) {
@@ -415,6 +410,20 @@ done:
return 1;
}
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+static struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
struct svc_rqst *rqstp,
@@ -627,8 +636,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto defer;
goto out;
}
- dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
- ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n",
+ ctxt, rdma_xprt, rqstp);
atomic_inc(&rdma_stat_recv);
/* Build up the XDR from the receive buffers. */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 54d533300620..ad4d286a83c5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -153,76 +153,68 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
-/* Returns the address of the first read chunk or <nul> if no read chunk
- * is present
+/* Parse the RPC Call's transport header.
*/
-struct rpcrdma_read_chunk *
-svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
+ struct rpcrdma_write_array **write,
+ struct rpcrdma_write_array **reply)
{
- struct rpcrdma_read_chunk *ch =
- (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ __be32 *p;
- if (ch->rc_discrim == xdr_zero)
- return NULL;
- return ch;
-}
+ p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
-/* Returns the address of the first read write array element or <nul>
- * if no write array list is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
-{
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] == xdr_zero)
- return NULL;
- return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+ /* Read list */
+ while (*p++ != xdr_zero)
+ p += 5;
+
+ /* Write list */
+ if (*p != xdr_zero) {
+ *write = (struct rpcrdma_write_array *)p;
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpu(*p) * 4;
+ } else {
+ *write = NULL;
+ p++;
+ }
+
+ /* Reply chunk */
+ if (*p != xdr_zero)
+ *reply = (struct rpcrdma_write_array *)p;
+ else
+ *reply = NULL;
}
-/* Returns the address of the first reply array element or <nul> if no
- * reply array is present
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one rkey to invalidate.
+ *
+ * Find a candidate rkey to invalidate when sending a reply. Picks the
+ * first rkey it finds in the chunks lists.
+ *
+ * Returns zero if RPC's chunk lists are empty.
*/
-static struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array *wr_ary)
+static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_write_array *wr_ary,
+ struct rpcrdma_write_array *rp_ary)
{
- struct rpcrdma_read_chunk *rch;
- struct rpcrdma_write_array *rp_ary;
-
- /* XXX: Need to fix when reply chunk may occur with read list
- * and/or write list.
- */
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] != xdr_zero)
- return NULL;
-
- rch = svc_rdma_get_read_chunk(rmsgp);
- if (rch) {
- while (rch->rc_discrim != xdr_zero)
- rch++;
-
- /* The reply chunk follows an empty write array located
- * at 'rc_position' here. The reply array is at rc_target.
- */
- rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
- goto found_it;
- }
+ struct rpcrdma_read_chunk *rd_ary;
+ struct rpcrdma_segment *arg_ch;
- if (wr_ary) {
- int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+ rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
+ if (rd_ary->rc_discrim != xdr_zero)
+ return be32_to_cpu(rd_ary->rc_target.rs_handle);
- rp_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[chunk].wc_target.rs_length;
- goto found_it;
+ if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
+ arg_ch = &wr_ary->wc_array[0].wc_target;
+ return be32_to_cpu(arg_ch->rs_handle);
}
- /* No read list, no write list */
- rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
+ if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
+ arg_ch = &rp_ary->wc_array[0].wc_target;
+ return be32_to_cpu(arg_ch->rs_handle);
+ }
- found_it:
- if (rp_ary->wc_discrim == xdr_zero)
- return NULL;
- return rp_ary;
+ return 0;
}
/* Assumptions:
@@ -280,7 +272,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
sge[sge_no].addr))
goto err;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count++;
sge_off = 0;
@@ -464,7 +456,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct page *page,
struct rpcrdma_msg *rdma_resp,
struct svc_rdma_req_map *vec,
- int byte_count)
+ int byte_count,
+ u32 inv_rkey)
{
struct svc_rdma_op_ctxt *ctxt;
struct ib_send_wr send_wr;
@@ -489,7 +482,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[0].length, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
goto err;
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
ctxt->direction = DMA_TO_DEVICE;
@@ -505,7 +498,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
if (ib_dma_mapping_error(rdma->sc_cm_id->device,
ctxt->sge[sge_no].addr))
goto err;
- atomic_inc(&rdma->sc_dma_used);
+ svc_rdma_count_mappings(rdma, ctxt);
ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
@@ -523,23 +516,9 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
- /*
- * If there are more pages than SGE, terminate SGE
- * list so that svc_rdma_unmap_dma doesn't attempt to
- * unmap garbage.
- */
- if (page_no+1 >= sge_no)
- ctxt->sge[page_no+1].length = 0;
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
- /* The loop above bumps sc_dma_used for each sge. The
- * xdr_buf.tail gets a separate sge, but resides in the
- * same page as xdr_buf.head. Don't count it twice.
- */
- if (sge_no > ctxt->count)
- atomic_dec(&rdma->sc_dma_used);
-
if (sge_no > rdma->sc_max_sge) {
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err;
@@ -549,7 +528,11 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.wr_cqe = &ctxt->cqe;
send_wr.sg_list = ctxt->sge;
send_wr.num_sge = sge_no;
- send_wr.opcode = IB_WR_SEND;
+ if (inv_rkey) {
+ send_wr.opcode = IB_WR_SEND_WITH_INV;
+ send_wr.ex.invalidate_rkey = inv_rkey;
+ } else
+ send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
ret = svc_rdma_send(rdma, &send_wr);
@@ -581,6 +564,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
int inline_bytes;
struct page *res_page;
struct svc_rdma_req_map *vec;
+ u32 inv_rkey;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -588,8 +572,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
* places this at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- wr_ary = svc_rdma_get_write_array(rdma_argp);
- rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
+
+ inv_rkey = 0;
+ if (rdma->sc_snd_w_inv)
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
/* Build an req vec for the XDR */
vec = svc_rdma_get_req_map(rdma);
@@ -598,7 +585,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
inline_bytes = rqstp->rq_res.len;
- /* Create the RDMA response header */
+ /* Create the RDMA response header. xprt->xpt_mutex,
+ * acquired in svc_send(), serializes RPC replies. The
+ * code path below that inserts the credit grant value
+ * into each transport header runs only inside this
+ * critical section.
+ */
ret = -ENOMEM;
res_page = alloc_page(GFP_KERNEL);
if (!res_page)
@@ -633,9 +625,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err1;
ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
- inline_bytes);
+ inline_bytes, inv_rkey);
if (ret < 0)
- goto err1;
+ goto err0;
svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
@@ -692,7 +684,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
svc_rdma_put_context(ctxt, 1);
return;
}
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
/* Prepare SEND WR */
memset(&err_wr, 0, sizeof(err_wr));
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index dd9440137834..39652d390a9c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -41,6 +41,7 @@
*/
#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/interrupt.h>
@@ -67,6 +68,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
static struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
@@ -79,6 +81,7 @@ static struct svc_xprt_ops svc_rdma_ops = {
.xpo_has_wspace = svc_rdma_has_wspace,
.xpo_accept = svc_rdma_accept,
.xpo_secure_port = svc_rdma_secure_port,
+ .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
};
struct svc_xprt_class svc_rdma_class = {
@@ -198,6 +201,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
out:
ctxt->count = 0;
+ ctxt->mapped_sges = 0;
ctxt->frmr = NULL;
return ctxt;
@@ -221,22 +225,24 @@ out_empty:
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
struct svcxprt_rdma *xprt = ctxt->xprt;
- int i;
- for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+ struct ib_device *device = xprt->sc_cm_id->device;
+ u32 lkey = xprt->sc_pd->local_dma_lkey;
+ unsigned int i;
+
+ for (i = 0; i < ctxt->mapped_sges; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_page(xprt->sc_cm_id->device,
+ if (ctxt->sge[i].lkey == lkey)
+ ib_dma_unmap_page(device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
- }
}
+ ctxt->mapped_sges = 0;
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -390,7 +396,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- ctxt->wc_status = wc->status;
svc_rdma_unmap_dma(ctxt);
if (wc->status != IB_WC_SUCCESS)
@@ -428,7 +433,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
goto err;
out:
- atomic_dec(&xprt->sc_sq_count);
+ atomic_inc(&xprt->sc_sq_avail);
wake_up(&xprt->sc_send_wait);
return;
@@ -600,7 +605,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
goto err_put_ctxt;
- atomic_inc(&xprt->sc_dma_used);
+ svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
@@ -642,6 +647,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
return ret;
}
+static void
+svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
+ struct rdma_conn_param *param)
+{
+ const struct rpcrdma_connect_private *pmsg = param->private_data;
+
+ if (pmsg &&
+ pmsg->cp_magic == rpcrdma_cmp_magic &&
+ pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+ newxprt->sc_snd_w_inv = pmsg->cp_flags &
+ RPCRDMA_CMP_F_SND_W_INV_OK;
+
+ dprintk("svcrdma: client send_size %u, recv_size %u "
+ "remote inv %ssupported\n",
+ rpcrdma_decode_buffer_size(pmsg->cp_send_size),
+ rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
+ newxprt->sc_snd_w_inv ? "" : "un");
+ }
+}
+
/*
* This function handles the CONNECT_REQUEST event on a listening
* endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -653,7 +678,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
* will call the recvfrom method on the listen xprt which will accept the new
* connection.
*/
-static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id,
+ struct rdma_conn_param *param)
{
struct svcxprt_rdma *listen_xprt = new_cma_id->context;
struct svcxprt_rdma *newxprt;
@@ -669,9 +695,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
new_cma_id->context = newxprt;
dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
newxprt, newxprt->sc_cm_id, listen_xprt);
+ svc_rdma_parse_connect_private(newxprt, param);
/* Save client advertised inbound read limit for use later in accept. */
- newxprt->sc_ord = client_ird;
+ newxprt->sc_ord = param->initiator_depth;
/* Set the local and remote addresses in the transport */
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
@@ -706,8 +733,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event = %s (%d)\n", cma_id, cma_id->context,
rdma_event_msg(event->event), event->event);
- handle_connect_req(cma_id,
- event->param.conn.initiator_depth);
+ handle_connect_req(cma_id, &event->param.conn);
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -917,7 +943,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -941,8 +966,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
+ struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
+ struct sockaddr *sap;
unsigned int i;
int ret = 0;
@@ -980,6 +1007,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
@@ -993,7 +1021,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
- newxprt->sc_pd = ib_alloc_pd(dev);
+ newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
@@ -1022,18 +1050,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
- " cm_id->device=%p, sc_pd->device=%p\n"
- " cap.max_send_wr = %d\n"
- " cap.max_recv_wr = %d\n"
- " cap.max_send_sge = %d\n"
- " cap.max_recv_sge = %d\n",
- newxprt->sc_cm_id, newxprt->sc_pd,
- dev, newxprt->sc_pd->device,
- qp_attr.cap.max_send_wr,
- qp_attr.cap.max_recv_wr,
- qp_attr.cap.max_send_sge,
- qp_attr.cap.max_recv_sge);
+ dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
+ newxprt->sc_cm_id, newxprt->sc_pd);
+ dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+ qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+ dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+ qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
@@ -1070,7 +1092,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dev->attrs.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
newxprt->sc_reader = rdma_read_chunk_frmr;
- }
+ } else
+ newxprt->sc_snd_w_inv = false;
/*
* Determine if a DMA MR is required and if so, what privs are required
@@ -1094,11 +1117,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/* Swap out the handler */
newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+ /* Construct RDMA-CM private message */
+ pmsg.cp_magic = rpcrdma_cmp_magic;
+ pmsg.cp_version = RPCRDMA_CMP_VERSION;
+ pmsg.cp_flags = 0;
+ pmsg.cp_send_size = pmsg.cp_recv_size =
+ rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
+
/* Accept Connection */
set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 0;
conn_param.initiator_depth = newxprt->sc_ord;
+ conn_param.private_data = &pmsg;
+ conn_param.private_data_len = sizeof(pmsg);
ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
if (ret) {
dprintk("svcrdma: failed to accept new connection, ret=%d\n",
@@ -1106,31 +1138,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
- dprintk("svcrdma: new connection %p accepted with the following "
- "attributes:\n"
- " local_ip : %pI4\n"
- " local_port : %d\n"
- " remote_ip : %pI4\n"
- " remote_port : %d\n"
- " max_sge : %d\n"
- " max_sge_rd : %d\n"
- " sq_depth : %d\n"
- " max_requests : %d\n"
- " ord : %d\n",
- newxprt,
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_port),
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_port),
- newxprt->sc_max_sge,
- newxprt->sc_max_sge_rd,
- newxprt->sc_sq_depth,
- newxprt->sc_max_requests,
- newxprt->sc_ord);
+ dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
+ dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
+ dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd);
+ dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
+ dprintk(" ord : %d\n", newxprt->sc_ord);
return &newxprt->sc_xprt;
@@ -1184,9 +1201,9 @@ static void __svc_rdma_free(struct work_struct *work)
ib_drain_qp(rdma->sc_qp);
/* We should only be called from kref_put */
- if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+ if (kref_read(&xprt->xpt_ref) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
/*
* Destroy queued, but not processed read completions. Note
@@ -1217,9 +1234,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
rdma->sc_ctxt_used);
- if (atomic_read(&rdma->sc_dma_used) != 0)
- pr_err("svcrdma: dma still in use? (%d)\n",
- atomic_read(&rdma->sc_dma_used));
/* Final put of backchannel client transport */
if (xprt->xpt_bc_xprt) {
@@ -1279,6 +1293,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
return 1;
}
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr, *n_wr;
@@ -1295,15 +1313,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
- spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
- spin_unlock_bh(&xprt->sc_lock);
+ if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
atomic_inc(&rdma_stat_sq_starve);
/* Wait until SQ WR available if SQ still full */
+ atomic_add(wr_count, &xprt->sc_sq_avail);
wait_event(xprt->sc_send_wait,
- atomic_read(&xprt->sc_sq_count) <
- xprt->sc_sq_depth);
+ atomic_read(&xprt->sc_sq_avail) > wr_count);
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
continue;
@@ -1313,21 +1329,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
svc_xprt_get(&xprt->sc_xprt);
/* Bump used SQ WR count and post */
- atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (ret) {
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- atomic_sub(wr_count, &xprt->sc_sq_count);
for (i = 0; i < wr_count; i ++)
svc_xprt_put(&xprt->sc_xprt);
- dprintk("svcrdma: failed to post SQ WR rc=%d, "
- "sc_sq_count=%d, sc_sq_depth=%d\n",
- ret, atomic_read(&xprt->sc_sq_count),
- xprt->sc_sq_depth);
- }
- spin_unlock_bh(&xprt->sc_lock);
- if (ret)
+ dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
+ dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n",
+ atomic_read(&xprt->sc_sq_avail),
+ xprt->sc_sq_depth);
wake_up(&xprt->sc_send_wait);
+ }
break;
}
return ret;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 81f0e879f019..534c178d2a7e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
.data = &xprt_rdma_max_inline_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
.data = &xprt_rdma_max_inline_write,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
@@ -219,6 +219,34 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
}
}
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ep, struct rpcrdma_xprt, rx_ep);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (++xprt->connect_cookie == 0) /* maintain a reserved value */
+ ++xprt->connect_cookie;
+ if (ep->rep_connected > 0) {
+ if (!xprt_test_and_set_connected(xprt))
+ xprt_wake_pending_tasks(xprt, 0);
+ } else {
+ if (xprt_test_and_clear_connected(xprt))
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
static void
xprt_rdma_connect_worker(struct work_struct *work)
{
@@ -477,115 +505,153 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
}
}
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ gfp_t flags)
+{
+ size_t size = RPCRDMA_HDRBUF_SIZE;
+ struct rpcrdma_regbuf *rb;
+
+ if (req->rl_rdmabuf)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_rdmabuf = rb;
+ return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ size_t size, gfp_t flags)
+{
+ struct rpcrdma_regbuf *rb;
+
+ if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ rpcrdma_free_regbuf(req->rl_sendbuf);
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_sendbuf = rb;
+ return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
+ *
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
*
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * A regbuf is used here to remember the buffer size.
*/
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ size_t size, gfp_t flags)
{
- struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_regbuf *rb;
+
+ if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+ return true;
+
+ rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+ if (IS_ERR(rb))
+ return false;
+
+ rpcrdma_free_regbuf(req->rl_recvbuf);
+ r_xprt->rx_stats.hardway_register_count += size;
+ req->rl_recvbuf = rb;
+ return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ * 0: Success; rq_buffer points to RPC buffer to use
+ * ENOMEM: Out of memory, call again later
+ * EIO: A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req;
- size_t min_size;
gfp_t flags;
req = rpcrdma_buffer_get(&r_xprt->rx_buf);
if (req == NULL)
- return NULL;
+ return -ENOMEM;
flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
- if (req->rl_rdmabuf == NULL)
- goto out_rdmabuf;
- if (req->rl_sendbuf == NULL)
- goto out_sendbuf;
- if (size > req->rl_sendbuf->rg_size)
- goto out_sendbuf;
-
-out:
- dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
- req->rl_connect_cookie = 0; /* our reserved value */
- req->rl_task = task;
- return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
- min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
- rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
- if (IS_ERR(rb))
+ if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
goto out_fail;
- req->rl_rdmabuf = rb;
-
-out_sendbuf:
- /* XDR encoding and RPC/RDMA marshaling of this request has not
- * yet occurred. Thus a lower bound is needed to prevent buffer
- * overrun during marshaling.
- *
- * RPC/RDMA marshaling may choose to send payload bearing ops
- * inline, if the result is smaller than the inline threshold.
- * The value of the "size" argument accounts for header
- * requirements but not for the payload in these cases.
- *
- * Likewise, allocate enough space to receive a reply up to the
- * size of the inline threshold.
- *
- * It's unlikely that both the send header and the received
- * reply will be large, but slush is provided here to allow
- * flexibility when marshaling.
- */
- min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
- min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
- if (size < min_size)
- size = min_size;
-
- rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
- if (IS_ERR(rb))
+ if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+ goto out_fail;
+ if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
goto out_fail;
- rb->rg_owner = req;
- r_xprt->rx_stats.hardway_register_count += size;
- rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
- req->rl_sendbuf = rb;
- goto out;
+ dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+ task->tk_pid, __func__, rqst->rq_callsize,
+ rqst->rq_rcvsize, req);
+
+ req->rl_connect_cookie = 0; /* our reserved value */
+ rpcrdma_set_xprtdata(rqst, req);
+ rqst->rq_buffer = req->rl_sendbuf->rg_base;
+ rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+ return 0;
out_fail:
rpcrdma_buffer_put(req);
- return NULL;
+ return -ENOMEM;
}
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
*/
static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
{
- struct rpcrdma_req *req;
- struct rpcrdma_xprt *r_xprt;
- struct rpcrdma_regbuf *rb;
-
- if (buffer == NULL)
- return;
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
- req = rb->rg_owner;
if (req->rl_backchannel)
return;
- r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
- !RPC_IS_ASYNC(req->rl_task));
-
+ if (unlikely(!list_empty(&req->rl_registered)))
+ ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+ rpcrdma_unmap_sges(ia, req);
rpcrdma_buffer_put(req);
}
@@ -620,7 +686,8 @@ xprt_rdma_send_request(struct rpc_task *task)
int rc = 0;
/* On retransmit, remove any previously registered chunks */
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ if (unlikely(!list_empty(&req->rl_registered)))
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
@@ -685,10 +752,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
- seq_printf(seq, "%lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %lu\n",
r_xprt->rx_stats.mrs_recovered,
r_xprt->rx_stats.mrs_orphaned,
- r_xprt->rx_stats.mrs_allocated);
+ r_xprt->rx_stats.mrs_allocated,
+ r_xprt->rx_stats.local_inv_needed);
}
static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 799cce6cbe45..11d07748f699 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -103,9 +103,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ib_event_msg(event->event),
- event->device->name, context);
+ pr_err("rpcrdma: %s on device %s ep %p\n",
+ ib_event_msg(event->event), event->device->name, context);
+
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
rpcrdma_conn_func(ep);
@@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
wc->status, wc->vendor_err);
}
-static void
-rpcrdma_receive_worker(struct work_struct *work)
-{
- struct rpcrdma_rep *rep =
- container_of(work, struct rpcrdma_rep, rr_work);
-
- rpcrdma_reply_handler(rep);
-}
-
/* Perform basic sanity checking to avoid using garbage
* to update the credit grant value.
*/
@@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
}
/**
- * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
* @cq: completion queue (ignored)
* @wc: completed WR
*
*/
static void
-rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
+ rep->rr_wc_flags = wc->wc_flags;
+ rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@ out_fail:
goto out_schedule;
}
+static void
+rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+ struct rdma_conn_param *param)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ const struct rpcrdma_connect_private *pmsg = param->private_data;
+ unsigned int rsize, wsize;
+
+ /* Default settings for RPC-over-RDMA Version One */
+ r_xprt->rx_ia.ri_reminv_expected = false;
+ rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+ wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+ if (pmsg &&
+ pmsg->cp_magic == rpcrdma_cmp_magic &&
+ pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+ r_xprt->rx_ia.ri_reminv_expected = true;
+ rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+ wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+ }
+
+ if (rsize < cdata->inline_rsize)
+ cdata->inline_rsize = rsize;
+ if (wsize < cdata->inline_wsize)
+ cdata->inline_wsize = wsize;
+ dprintk("RPC: %s: max send %u, max recv %u\n",
+ __func__, cdata->inline_wsize, cdata->inline_rsize);
+ rpcrdma_set_max_header_sizes(r_xprt);
+}
+
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
" (%d initiator)\n",
__func__, attr->max_dest_rd_atomic,
attr->max_rd_atomic);
+ rpcrdma_update_connect_private(xprt, &event->param.conn);
goto connected;
case RDMA_CM_EVENT_CONNECT_ERROR:
connstate = -ENOTCONN;
@@ -306,6 +331,7 @@ static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
{
+ unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
struct rdma_cm_id *id;
int rc;
@@ -327,8 +353,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto out;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto out;
+ }
/* FIXME:
* Until xprtrdma supports DEVICE_REMOVAL, the provider must
@@ -351,8 +381,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto put;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto put;
+ }
rc = ia->ri_async_rc;
if (rc)
goto put;
@@ -387,7 +421,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
ia->ri_device = ia->ri_id->device;
- ia->ri_pd = ib_alloc_pd(ia->ri_device);
+ ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
@@ -454,11 +488,12 @@ int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
+ struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
struct ib_cq *sendcq, *recvcq;
unsigned int max_qp_wr;
int rc;
- if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+ if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
dprintk("RPC: %s: insufficient sge's available\n",
__func__);
return -ENOMEM;
@@ -487,7 +522,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
- ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -506,7 +541,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0; /* always signal? */
- INIT_CQCOUNT(ep);
+ rpcrdma_init_cqcount(ep, 0);
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -536,9 +571,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Initialize cma parameters */
memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
- /* RPC/RDMA does not use private data */
- ep->rep_remote_cma.private_data = NULL;
- ep->rep_remote_cma.private_data_len = 0;
+ /* Prepare RDMA-CM private message */
+ pmsg->cp_magic = rpcrdma_cmp_magic;
+ pmsg->cp_version = RPCRDMA_CMP_VERSION;
+ pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+ ep->rep_remote_cma.private_data = pmsg;
+ ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +889,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
INIT_LIST_HEAD(&req->rl_registered);
+ req->rl_send_wr.next = NULL;
+ req->rl_send_wr.wr_cqe = &req->rl_cqe;
+ req->rl_send_wr.sg_list = req->rl_send_sge;
+ req->rl_send_wr.opcode = IB_WR_SEND;
return req;
}
@@ -865,17 +909,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
- GFP_KERNEL);
+ rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+ DMA_FROM_DEVICE, GFP_KERNEL);
if (IS_ERR(rep->rr_rdmabuf)) {
rc = PTR_ERR(rep->rr_rdmabuf);
goto out_free;
}
rep->rr_device = ia->ri_device;
- rep->rr_cqe.done = rpcrdma_receive_wc;
+ rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
- INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+ INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+ rep->rr_recv_wr.next = NULL;
+ rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+ rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+ rep->rr_recv_wr.num_sge = 1;
return rep;
out_free:
@@ -966,17 +1014,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
}
static void
-rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
{
- rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+ rpcrdma_free_regbuf(rep->rr_rdmabuf);
kfree(rep);
}
void
-rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_destroy_req(struct rpcrdma_req *req)
{
- rpcrdma_free_regbuf(ia, req->rl_sendbuf);
- rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+ rpcrdma_free_regbuf(req->rl_recvbuf);
+ rpcrdma_free_regbuf(req->rl_sendbuf);
+ rpcrdma_free_regbuf(req->rl_rdmabuf);
kfree(req);
}
@@ -1009,15 +1058,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-
cancel_delayed_work_sync(&buf->rb_recovery_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
rep = rpcrdma_buffer_get_rep_locked(buf);
- rpcrdma_destroy_rep(ia, rep);
+ rpcrdma_destroy_rep(rep);
}
buf->rb_send_count = 0;
@@ -1030,7 +1077,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
list_del(&req->rl_all);
spin_unlock(&buf->rb_reqslock);
- rpcrdma_destroy_req(ia, req);
+ rpcrdma_destroy_req(req);
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1176,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_rep *rep = req->rl_reply;
- req->rl_niovs = 0;
+ req->rl_send_wr.num_sge = 0;
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
@@ -1171,70 +1218,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
spin_unlock(&buffers->rb_lock);
}
-/*
- * Wrappers for internal-use kmalloc memory registration, used by buffer code.
- */
-
/**
- * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
- * @ia: controlling rpcrdma_ia
+ * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
* @size: size of buffer to be allocated, in bytes
+ * @direction: direction of data movement
* @flags: GFP flags
*
- * Returns pointer to private header of an area of internally
- * registered memory, or an ERR_PTR. The registered buffer follows
- * the end of the private header.
+ * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+ * can be persistently DMA-mapped for I/O.
*
* xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
- * receiving the payload of RDMA RECV operations. regbufs are not
- * used for RDMA READ/WRITE operations, thus are registered only for
- * LOCAL access.
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via ro_map.
*/
struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+ gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- struct ib_sge *iov;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
- goto out;
+ return ERR_PTR(-ENOMEM);
- iov = &rb->rg_iov;
- iov->addr = ib_dma_map_single(ia->ri_device,
- (void *)rb->rg_base, size,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_device, iov->addr))
- goto out_free;
+ rb->rg_device = NULL;
+ rb->rg_direction = direction;
+ rb->rg_iov.length = size;
- iov->length = size;
- iov->lkey = ia->ri_pd->local_dma_lkey;
- rb->rg_size = size;
- rb->rg_owner = NULL;
return rb;
+}
-out_free:
- kfree(rb);
-out:
- return ERR_PTR(-ENOMEM);
+/**
+ * __rpcrdma_map_regbuf - DMA-map a regbuf
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be mapped
+ */
+bool
+__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+ if (rb->rg_direction == DMA_NONE)
+ return false;
+
+ rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base,
+ rdmab_length(rb),
+ rb->rg_direction);
+ if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ return false;
+
+ rb->rg_device = ia->ri_device;
+ rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+ return true;
+}
+
+static void
+rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+{
+ if (!rpcrdma_regbuf_is_mapped(rb))
+ return;
+
+ ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+ rdmab_length(rb), rb->rg_direction);
+ rb->rg_device = NULL;
}
/**
* rpcrdma_free_regbuf - deregister and free registered buffer
- * @ia: controlling rpcrdma_ia
* @rb: regbuf to be deregistered and freed
*/
void
-rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
- struct ib_sge *iov;
-
if (!rb)
return;
- iov = &rb->rg_iov;
- ib_dma_unmap_single(ia->ri_device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ rpcrdma_dma_unmap_regbuf(rb);
kfree(rb);
}
@@ -1248,39 +1306,22 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
- struct ib_device *device = ia->ri_device;
- struct ib_send_wr send_wr, *send_wr_fail;
- struct rpcrdma_rep *rep = req->rl_reply;
- struct ib_sge *iov = req->rl_send_iov;
- int i, rc;
+ struct ib_send_wr *send_wr = &req->rl_send_wr;
+ struct ib_send_wr *send_wr_fail;
+ int rc;
- if (rep) {
- rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (req->rl_reply) {
+ rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
if (rc)
return rc;
req->rl_reply = NULL;
}
- send_wr.next = NULL;
- send_wr.wr_cqe = &req->rl_cqe;
- send_wr.sg_list = iov;
- send_wr.num_sge = req->rl_niovs;
- send_wr.opcode = IB_WR_SEND;
-
- for (i = 0; i < send_wr.num_sge; i++)
- ib_dma_sync_single_for_device(device, iov[i].addr,
- iov[i].length, DMA_TO_DEVICE);
dprintk("RPC: %s: posting %d s/g entries\n",
- __func__, send_wr.num_sge);
-
- if (DECR_CQCOUNT(ep) > 0)
- send_wr.send_flags = 0;
- else { /* Provider must take a send completion every now and then */
- INIT_CQCOUNT(ep);
- send_wr.send_flags = IB_SEND_SIGNALED;
- }
+ __func__, send_wr->num_sge);
- rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+ rpcrdma_set_signaled(ep, send_wr);
+ rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
if (rc)
goto out_postsend_err;
return 0;
@@ -1290,32 +1331,24 @@ out_postsend_err:
return -ENOTCONN;
}
-/*
- * (Re)post a receive buffer.
- */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
struct rpcrdma_rep *rep)
{
- struct ib_recv_wr recv_wr, *recv_wr_fail;
+ struct ib_recv_wr *recv_wr_fail;
int rc;
- recv_wr.next = NULL;
- recv_wr.wr_cqe = &rep->rr_cqe;
- recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
- recv_wr.num_sge = 1;
-
- ib_dma_sync_single_for_cpu(ia->ri_device,
- rdmab_addr(rep->rr_rdmabuf),
- rdmab_length(rep->rr_rdmabuf),
- DMA_BIDIRECTIONAL);
-
- rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+ if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+ goto out_map;
+ rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
if (rc)
goto out_postrecv;
return 0;
+out_map:
+ pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+ return -EIO;
+
out_postrecv:
pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
return -ENOTCONN;
@@ -1333,7 +1366,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
{
struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
struct rpcrdma_rep *rep;
int rc;
@@ -1344,7 +1376,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
rep = rpcrdma_buffer_get_rep_locked(buffers);
spin_unlock(&buffers->rb_lock);
- rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ rc = rpcrdma_ep_post_recv(ia, rep);
if (rc)
goto out_rc;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a71b0f5897d8..e35efd4ac1e4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,9 +70,12 @@ struct rpcrdma_ia {
struct ib_pd *ri_pd;
struct completion ri_done;
int ri_async_rc;
+ unsigned int ri_max_segs;
unsigned int ri_max_frmr_depth;
unsigned int ri_max_inline_write;
unsigned int ri_max_inline_read;
+ bool ri_reminv_expected;
+ enum ib_mr_type ri_mrtype;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -87,13 +90,30 @@ struct rpcrdma_ep {
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
+ struct rpcrdma_connect_private rep_cm_private;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
};
-#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
-#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+static inline void
+rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count)
+{
+ atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count);
+}
+
+/* To update send queue accounting, provider must take a
+ * send completion every now and then.
+ */
+static inline void
+rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr)
+{
+ send_wr->send_flags = 0;
+ if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) {
+ rpcrdma_init_cqcount(ep, 0);
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ }
+}
/* Pre-allocate extra Work Requests for handling backward receives
* and sends. This is a fixed value because the Work Queues are
@@ -112,9 +132,9 @@ struct rpcrdma_ep {
*/
struct rpcrdma_regbuf {
- size_t rg_size;
- struct rpcrdma_req *rg_owner;
struct ib_sge rg_iov;
+ struct ib_device *rg_device;
+ enum dma_data_direction rg_direction;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
@@ -162,7 +182,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* The smallest inline threshold is 1024 bytes, ensuring that
* at least 750 bytes are available for RPC messages.
*/
-#define RPCRDMA_MAX_HDR_SEGS (8)
+enum {
+ RPCRDMA_MAX_HDR_SEGS = 8,
+ RPCRDMA_HDRBUF_SIZE = 256,
+};
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +205,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
unsigned int rr_len;
+ int rr_wc_flags;
+ u32 rr_inv_rkey;
struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct list_head rr_list;
+ struct ib_recv_wr rr_recv_wr;
struct rpcrdma_regbuf *rr_rdmabuf;
};
@@ -207,7 +233,8 @@ struct rpcrdma_rep {
enum rpcrdma_frmr_state {
FRMR_IS_INVALID, /* ready to be used */
FRMR_IS_VALID, /* in use */
- FRMR_IS_STALE, /* failed completion */
+ FRMR_FLUSHED_FR, /* flushed FASTREG WR */
+ FRMR_FLUSHED_LI, /* flushed LOCALINV WR */
};
struct rpcrdma_frmr {
@@ -276,19 +303,30 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
-#define RPCRDMA_MAX_IOVS (2)
+/* Reserve enough Send SGEs to send a maximum size inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - xdr_buf tail iovec
+ */
+enum {
+ RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
+ RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+ RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_free;
- unsigned int rl_niovs;
+ unsigned int rl_mapped_sges;
unsigned int rl_connect_cookie;
- struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
- struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_rep *rl_reply;
+ struct ib_send_wr rl_send_wr;
+ struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+ struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
+ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */
+ struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */
struct ib_cqe rl_cqe;
struct list_head rl_all;
@@ -298,14 +336,16 @@ struct rpcrdma_req {
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
+static inline void
+rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
+{
+ rqst->rq_xprtdata = req;
+}
+
static inline struct rpcrdma_req *
rpcr_to_rdmar(struct rpc_rqst *rqst)
{
- void *buffer = rqst->rq_buffer;
- struct rpcrdma_regbuf *rb;
-
- rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
- return rb->rg_owner;
+ return rqst->rq_xprtdata;
}
/*
@@ -356,15 +396,6 @@ struct rpcrdma_create_data_internal {
unsigned int padding; /* non-rdma write header padding */
};
-#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
- (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
-
-#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
- (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
-
-#define RPCRDMA_INLINE_PAD_VALUE(rq)\
- rpcx_to_rdmad(rq->rq_xprt).padding
-
/*
* Statistics for RPCRDMA
*/
@@ -386,6 +417,7 @@ struct rpcrdma_stats {
unsigned long mrs_recovered;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
+ unsigned long local_inv_needed;
};
/*
@@ -409,6 +441,7 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mw *);
void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname;
+ const int ro_send_w_inv_ok;
};
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -457,19 +490,19 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_conn_func(struct rpcrdma_ep *ep);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_rep *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
/*
* Buffer calls - xprtrdma/verbs.c
*/
struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_destroy_req(struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
@@ -482,10 +515,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
- size_t, gfp_t);
-void rpcrdma_free_regbuf(struct rpcrdma_ia *,
- struct rpcrdma_regbuf *);
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
+ gfp_t);
+bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
+void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+
+static inline bool
+rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_device != NULL;
+}
+
+static inline bool
+rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+ if (likely(rpcrdma_regbuf_is_mapped(rb)))
+ return true;
+ return __rpcrdma_dma_map_regbuf(ia, rb);
+}
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
@@ -503,25 +550,30 @@ rpcrdma_data_dir(bool writing)
}
/*
- * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
- */
-void rpcrdma_connect_worker(struct work_struct *);
-void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct rpcrdma_rep *);
-
-/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
+
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
+bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
+ u32, struct xdr_buf *, enum rpcrdma_chunktype);
+void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_marshal_req(struct rpc_rqst *);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
- struct rpcrdma_create_data_internal *,
- unsigned int);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_reply_handler(struct work_struct *work);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void rpcrdma_connect_worker(struct work_struct *work);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf168838a029..af392d9b9cec 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
/* Race breaker in case memory is freed before above code is called */
- sk->sk_write_space(sk);
+ if (ret == -EAGAIN) {
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+ rcu_read_unlock();
+
+ sk->sk_write_space(sk);
+ }
return ret;
}
@@ -1071,10 +1080,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
if (sk == NULL)
goto out;
for (;;) {
- skb = skb_recv_datagram(sk, 0, 1, &err);
+ skb = skb_recv_udp(sk, 0, 1, &err);
if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram_locked(sk, skb);
+ consume_skb(skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
@@ -2533,35 +2542,39 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
* we allocate pages instead doing a kmalloc like rpc_malloc is because we want
* to use the server side send routines.
*/
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static int bc_malloc(struct rpc_task *task)
{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ size_t size = rqst->rq_callsize;
struct page *page;
struct rpc_buffer *buf;
- WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
- if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
- return NULL;
+ if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
+ WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
+ size);
+ return -EINVAL;
+ }
page = alloc_page(GFP_KERNEL);
if (!page)
- return NULL;
+ return -ENOMEM;
buf = page_address(page);
buf->len = PAGE_SIZE;
- return buf->data;
+ rqst->rq_buffer = buf->data;
+ rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+ return 0;
}
/*
* Free the space allocated in the bc_alloc routine
*/
-static void bc_free(void *buffer)
+static void bc_free(struct rpc_task *task)
{
+ void *buffer = task->tk_rqstp->rq_buffer;
struct rpc_buffer *buf;
- if (!buffer)
- return;
-
buf = container_of(buffer, struct rpc_buffer, data);
free_page((unsigned long)buf);
}
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index a5fc9dd24aa9..017801f9dbaa 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -21,7 +21,6 @@
#include <linux/workqueue.h>
#include <linux/if_vlan.h>
#include <linux/rtnetlink.h>
-#include <net/ip_fib.h>
#include <net/switchdev.h>
/**
@@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_VLAN:
return sizeof(struct switchdev_obj_port_vlan);
- case SWITCHDEV_OBJ_ID_IPV4_FIB:
- return sizeof(struct switchdev_obj_ipv4_fib);
case SWITCHDEV_OBJ_ID_PORT_FDB:
return sizeof(struct switchdev_obj_port_fdb);
case SWITCHDEV_OBJ_ID_PORT_MDB:
@@ -627,13 +624,10 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
struct switchdev_notifier_info *info)
{
- int err;
-
ASSERT_RTNL();
info->dev = dev;
- err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
- return err;
+ return raw_notifier_call_chain(&switchdev_notif_chain, val, info);
}
EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
@@ -774,6 +768,9 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD;
int err;
+ if (!netif_is_bridge_port(dev))
+ return -EOPNOTSUPP;
+
err = switchdev_port_attr_get(dev, &attr);
if (err && err != -EOPNOTSUPP)
return err;
@@ -929,6 +926,9 @@ int switchdev_port_bridge_setlink(struct net_device *dev,
struct nlattr *afspec;
int err = 0;
+ if (!netif_is_bridge_port(dev))
+ return -EOPNOTSUPP;
+
protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
IFLA_PROTINFO);
if (protinfo) {
@@ -962,6 +962,9 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
{
struct nlattr *afspec;
+ if (!netif_is_bridge_port(dev))
+ return -EOPNOTSUPP;
+
afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
IFLA_AF_SPEC);
if (afspec)
@@ -1042,7 +1045,7 @@ static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj)
struct nlmsghdr *nlh;
struct ndmsg *ndm;
- if (dump->idx < dump->cb->args[0])
+ if (dump->idx < dump->cb->args[2])
goto skip;
nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
@@ -1089,7 +1092,7 @@ nla_put_failure:
*/
int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev,
- struct net_device *filter_dev, int idx)
+ struct net_device *filter_dev, int *idx)
{
struct switchdev_fdb_dump dump = {
.fdb.obj.orig_dev = dev,
@@ -1097,207 +1100,27 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
.dev = dev,
.skb = skb,
.cb = cb,
- .idx = idx,
+ .idx = *idx,
};
int err;
err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
switchdev_port_fdb_dump_cb);
- cb->args[1] = err;
- return dump.idx;
+ *idx = dump.idx;
+ return err;
}
EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
-static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
-{
- const struct switchdev_ops *ops = dev->switchdev_ops;
- struct net_device *lower_dev;
- struct net_device *port_dev;
- struct list_head *iter;
-
- /* Recusively search down until we find a sw port dev.
- * (A sw port dev supports switchdev_port_attr_get).
- */
-
- if (ops && ops->switchdev_port_attr_get)
- return dev;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- port_dev = switchdev_get_lowest_dev(lower_dev);
- if (port_dev)
- return port_dev;
- }
-
- return NULL;
-}
-
-static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
-{
- struct switchdev_attr attr = {
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
- struct switchdev_attr prev_attr;
- struct net_device *dev = NULL;
- int nhsel;
-
- ASSERT_RTNL();
-
- /* For this route, all nexthop devs must be on the same switch. */
-
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (!nh->nh_dev)
- return NULL;
-
- dev = switchdev_get_lowest_dev(nh->nh_dev);
- if (!dev)
- return NULL;
-
- attr.orig_dev = dev;
- if (switchdev_port_attr_get(dev, &attr))
- return NULL;
-
- if (nhsel > 0 &&
- !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
- return NULL;
-
- prev_attr = attr;
- }
-
- return dev;
-}
-
-/**
- * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @nlflags: netlink flags passed in (NLM_F_*)
- * @tb_id: route table ID
- *
- * Add/modify switch IPv4 route entry.
- */
-int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 nlflags, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = nlflags,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- /* Don't offload route if using custom ip rules or if
- * IPv4 FIB offloading has been disabled completely.
- */
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (fi->fib_net->ipv4.fib_has_custom_rules)
- return 0;
-#endif
-
- if (fi->fib_net->ipv4.fib_offload_disabled)
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags |= RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
-
-/**
- * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @tb_id: route table ID
- *
- * Delete IPv4 route entry from switch device.
- */
-int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = 0,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- if (!(fi->fib_flags & RTNH_F_OFFLOAD))
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags &= ~RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
-
-/**
- * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
- *
- * @fi: route FIB info structure
- */
-void switchdev_fib_ipv4_abort(struct fib_info *fi)
-{
- /* There was a problem installing this route to the offload
- * device. For now, until we come up with more refined
- * policy handling, abruptly end IPv4 fib offloading for
- * for entire net by flushing offload device(s) of all
- * IPv4 routes, and mark IPv4 fib offloading broken from
- * this point forward.
- */
-
- fib_flush_external(fi->fib_net);
- fi->fib_net->ipv4.fib_offload_disabled = true;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
-
bool switchdev_port_same_parent_id(struct net_device *a,
struct net_device *b)
{
struct switchdev_attr a_attr = {
.orig_dev = a,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- .flags = SWITCHDEV_F_NO_RECURSE,
};
struct switchdev_attr b_attr = {
.orig_dev = b,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- .flags = SWITCHDEV_F_NO_RECURSE,
};
if (switchdev_port_attr_get(a, &a_attr) ||
@@ -1306,89 +1129,4 @@ bool switchdev_port_same_parent_id(struct net_device *a,
return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
}
-
-static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
- struct net_device *group_dev)
-{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
- if (lower_dev == dev)
- continue;
- if (switchdev_port_same_parent_id(dev, lower_dev))
- return lower_dev->offload_fwd_mark;
- return switchdev_port_fwd_mark_get(dev, lower_dev);
- }
-
- return dev->ifindex;
-}
EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id);
-
-static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
- u32 old_mark, u32 *reset_mark)
-{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
- if (lower_dev->offload_fwd_mark == old_mark) {
- if (!*reset_mark)
- *reset_mark = lower_dev->ifindex;
- lower_dev->offload_fwd_mark = *reset_mark;
- }
- switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
- }
-}
-
-/**
- * switchdev_port_fwd_mark_set - Set port offload forwarding mark
- *
- * @dev: port device
- * @group_dev: containing device
- * @joining: true if dev is joining group; false if leaving group
- *
- * An ungrouped port's offload mark is just its ifindex. A grouped
- * port's (member of a bridge, for example) offload mark is the ifindex
- * of one of the ports in the group with the same parent (switch) ID.
- * Ports on the same device in the same group will have the same mark.
- *
- * Example:
- *
- * br0 ifindex=9
- * sw1p1 ifindex=2 mark=2
- * sw1p2 ifindex=3 mark=2
- * sw2p1 ifindex=4 mark=5
- * sw2p2 ifindex=5 mark=5
- *
- * If sw2p2 leaves the bridge, we'll have:
- *
- * br0 ifindex=9
- * sw1p1 ifindex=2 mark=2
- * sw1p2 ifindex=3 mark=2
- * sw2p1 ifindex=4 mark=4
- * sw2p2 ifindex=5 mark=5
- */
-void switchdev_port_fwd_mark_set(struct net_device *dev,
- struct net_device *group_dev,
- bool joining)
-{
- u32 mark = dev->ifindex;
- u32 reset_mark = 0;
-
- if (group_dev) {
- ASSERT_RTNL();
- if (joining)
- mark = switchdev_port_fwd_mark_get(dev, group_dev);
- else if (dev->offload_fwd_mark == mark)
- /* Ohoh, this port was the mark reference port,
- * but it's leaving the group, so reset the
- * mark for the remaining ports in the group.
- */
- switchdev_port_fwd_mark_reset(group_dev, mark,
- &reset_mark);
- }
-
- dev->offload_fwd_mark = mark;
-}
-EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 46a71c701e7c..919981324171 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -27,9 +27,9 @@
#endif
static struct ctl_table_set *
-net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+net_ctl_header_lookup(struct ctl_table_root *root)
{
- return &namespaces->net_ns->sysctls;
+ return &current->nsproxy->net_ns->sysctls;
}
static int is_seen(struct ctl_table_set *set)
@@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header *head,
struct ctl_table *table)
{
struct net *net = container_of(head->set, struct net, sysctls);
- kuid_t root_uid = make_kuid(net->user_ns, 0);
- kgid_t root_gid = make_kgid(net->user_ns, 0);
/* Allow network administrator to have same access as root. */
- if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) ||
- uid_eq(root_uid, current_euid())) {
+ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) {
int mode = (table->mode >> 6) & 7;
return (mode << 6) | (mode << 3) | mode;
}
- /* Allow netns root group to have the same access as the root group */
- if (in_egroup_p(root_gid)) {
- int mode = (table->mode >> 3) & 7;
- return (mode << 3) | mode;
- }
+
return table->mode;
}
+static void net_ctl_set_ownership(struct ctl_table_header *head,
+ struct ctl_table *table,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct net *net = container_of(head->set, struct net, sysctls);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(net->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(net->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
static struct ctl_table_root net_sysctl_root = {
.lookup = net_ctl_header_lookup,
.permissions = net_ctl_permissions,
+ .set_ownership = net_ctl_set_ownership,
};
static int __net_init sysctl_net_init(struct net *net)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index ae469b37d852..aa1babbea385 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -247,11 +247,17 @@ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb)
*
* RCU is locked, no other locks set
*/
-void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked)
+void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr)
{
struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
+ u16 acked = msg_bcast_ack(hdr);
struct sk_buff_head xmitq;
+ /* Ignore bc acks sent by peer before bcast synch point was received */
+ if (msg_bc_ack_invalid(hdr))
+ return;
+
__skb_queue_head_init(&xmitq);
tipc_bcast_lock(net);
@@ -269,20 +275,21 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked)
*
* RCU is locked, no other locks set
*/
-void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
- struct tipc_msg *hdr)
+int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr)
{
struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
struct sk_buff_head xmitq;
+ int rc = 0;
__skb_queue_head_init(&xmitq);
tipc_bcast_lock(net);
- if (msg_type(hdr) == STATE_MSG) {
- tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq);
- tipc_link_bc_sync_rcv(l, hdr, &xmitq);
- } else {
+ if (msg_type(hdr) != STATE_MSG) {
tipc_link_bc_init_rcv(l, hdr);
+ } else if (!msg_bc_ack_invalid(hdr)) {
+ tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq);
+ rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq);
}
tipc_bcast_unlock(net);
@@ -291,6 +298,7 @@ void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
/* Any socket wakeup messages ? */
if (!skb_queue_empty(inputq))
tipc_sk_rcv(net, inputq);
+ return rc;
}
/* tipc_bcast_add_peer - add a peer node to broadcast link and bearer
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index d5e79b3767fd..855d53c64ab3 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -55,9 +55,10 @@ void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
int tipc_bcast_get_mtu(struct net *net);
int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
-void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked);
-void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
- struct tipc_msg *hdr);
+void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr);
+int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+ struct tipc_msg *hdr);
int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
int tipc_bclink_reset_stats(struct net *net);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 65b1bbf133bd..52d74760fb68 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -42,6 +42,7 @@
#include "monitor.h"
#include "bcast.h"
#include "netlink.h"
+#include "udp_media.h"
#define MAX_ADDR_STR 60
@@ -56,6 +57,13 @@ static struct tipc_media * const media_info_array[] = {
NULL
};
+static struct tipc_bearer *bearer_get(struct net *net, int bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ return rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+}
+
static void bearer_disable(struct net *net, struct tipc_bearer *b);
/**
@@ -323,6 +331,7 @@ restart:
b->domain = disc_domain;
b->net_plane = bearer_id + 'A';
b->priority = priority;
+ test_and_set_bit_lock(0, &b->up);
res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
if (res) {
@@ -360,15 +369,24 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
*/
void tipc_bearer_reset_all(struct net *net)
{
- struct tipc_net *tn = tipc_net(net);
struct tipc_bearer *b;
int i;
for (i = 0; i < MAX_BEARERS; i++) {
- b = rcu_dereference_rtnl(tn->bearer_list[i]);
+ b = bearer_get(net, i);
+ if (b)
+ clear_bit_unlock(0, &b->up);
+ }
+ for (i = 0; i < MAX_BEARERS; i++) {
+ b = bearer_get(net, i);
if (b)
tipc_reset_bearer(net, b);
}
+ for (i = 0; i < MAX_BEARERS; i++) {
+ b = bearer_get(net, i);
+ if (b)
+ test_and_set_bit_lock(0, &b->up);
+ }
}
/**
@@ -382,8 +400,9 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
int bearer_id = b->identity;
pr_info("Disabling bearer <%s>\n", b->name);
- b->media->disable_media(b);
+ clear_bit_unlock(0, &b->up);
tipc_node_delete_links(net, bearer_id);
+ b->media->disable_media(b);
RCU_INIT_POINTER(b->media_ptr, NULL);
if (b->link_req)
tipc_disc_delete(b->link_req);
@@ -402,6 +421,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
dev = dev_get_by_name(net, driver_name);
if (!dev)
return -ENODEV;
+ if (tipc_mtu_bad(dev, 0)) {
+ dev_put(dev);
+ return -EINVAL;
+ }
/* Associate TIPC bearer with L2 bearer */
rcu_assign_pointer(b->media_ptr, dev);
@@ -440,22 +463,16 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
{
struct net_device *dev;
int delta;
- void *tipc_ptr;
dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
if (!dev)
return 0;
- /* Send RESET message even if bearer is detached from device */
- tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
- if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb))))
- goto drop;
-
- delta = dev->hard_header_len - skb_headroom(skb);
- if ((delta > 0) &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
- goto drop;
-
+ delta = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb));
+ if ((delta > 0) && pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ return 0;
+ }
skb_reset_network_header(skb);
skb->dev = dev;
skb->protocol = htons(ETH_P_TIPC);
@@ -463,9 +480,6 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
dev->dev_addr, skb->len);
dev_queue_xmit(skb);
return 0;
-drop:
- kfree_skb(skb);
- return 0;
}
int tipc_bearer_mtu(struct net *net, u32 bearer_id)
@@ -487,12 +501,12 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
struct sk_buff *skb,
struct tipc_media_addr *dest)
{
- struct tipc_net *tn = tipc_net(net);
+ struct tipc_msg *hdr = buf_msg(skb);
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (likely(b))
+ b = bearer_get(net, bearer_id);
+ if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr))))
b->media->send_msg(net, skb, b, dest);
else
kfree_skb(skb);
@@ -505,7 +519,6 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq,
struct tipc_media_addr *dst)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_bearer *b;
struct sk_buff *skb, *tmp;
@@ -513,12 +526,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
return;
rcu_read_lock();
- b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ b = bearer_get(net, bearer_id);
if (unlikely(!b))
__skb_queue_purge(xmitq);
skb_queue_walk_safe(xmitq, skb, tmp) {
__skb_dequeue(xmitq);
- b->media->send_msg(net, skb, b, dst);
+ if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb))))
+ b->media->send_msg(net, skb, b, dst);
+ else
+ kfree_skb(skb);
}
rcu_read_unlock();
}
@@ -535,8 +551,8 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
struct tipc_msg *hdr;
rcu_read_lock();
- b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (unlikely(!b))
+ b = bearer_get(net, bearer_id);
+ if (unlikely(!b || !test_bit(0, &b->up)))
__skb_queue_purge(xmitq);
skb_queue_walk_safe(xmitq, skb, tmp) {
hdr = buf_msg(skb);
@@ -566,7 +582,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
rcu_read_lock();
b = rcu_dereference_rtnl(dev->tipc_ptr);
- if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) {
+ if (likely(b && test_bit(0, &b->up) &&
+ (skb->pkt_type <= PACKET_BROADCAST))) {
skb->next = NULL;
tipc_rcv(dev_net(dev), skb, b);
rcu_read_unlock();
@@ -591,36 +608,29 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
- struct tipc_net *tn = tipc_net(net);
struct tipc_bearer *b;
- int i;
b = rtnl_dereference(dev->tipc_ptr);
- if (!b) {
- for (i = 0; i < MAX_BEARERS; b = NULL, i++) {
- b = rtnl_dereference(tn->bearer_list[i]);
- if (b && (b->media_ptr == dev))
- break;
- }
- }
if (!b)
return NOTIFY_DONE;
- b->mtu = dev->mtu;
-
switch (evt) {
case NETDEV_CHANGE:
if (netif_carrier_ok(dev))
break;
case NETDEV_UP:
- rcu_assign_pointer(dev->tipc_ptr, b);
+ test_and_set_bit_lock(0, &b->up);
break;
case NETDEV_GOING_DOWN:
- RCU_INIT_POINTER(dev->tipc_ptr, NULL);
- synchronize_net();
+ clear_bit_unlock(0, &b->up);
tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEMTU:
+ if (tipc_mtu_bad(dev, 0)) {
+ bearer_disable(net, b);
+ break;
+ }
+ b->mtu = dev->mtu;
tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEADDR:
@@ -709,6 +719,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
goto prop_msg_full;
nla_nest_end(msg->skb, prop);
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) {
+ if (tipc_udp_nl_add_bearer_data(msg, bearer))
+ goto attr_msg_full;
+ }
+#endif
+
nla_nest_end(msg->skb, attrs);
genlmsg_end(msg->skb, hdr);
@@ -895,6 +913,49 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
return 0;
}
+int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct tipc_bearer *b;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+
+ if (!info->attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+ info->attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+ name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+ rtnl_lock();
+ b = tipc_bearer_find(net, name);
+ if (!b) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) {
+ err = tipc_udp_nl_bearer_add(b,
+ attrs[TIPC_NLA_BEARER_UDP_OPTS]);
+ if (err) {
+ rtnl_unlock();
+ return err;
+ }
+ }
+#endif
+ rtnl_unlock();
+
+ return 0;
+}
+
int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
{
int err;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 43757f1f9cb3..278ff7f616f9 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -39,6 +39,7 @@
#include "netlink.h"
#include "core.h"
+#include "msg.h"
#include <net/genetlink.h>
#define MAX_MEDIA 3
@@ -59,6 +60,9 @@
#define TIPC_MEDIA_TYPE_IB 2
#define TIPC_MEDIA_TYPE_UDP 3
+/* minimum bearer MTU */
+#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE)
+
/**
* struct tipc_media_addr - destination address used by TIPC bearers
* @value: address info (format defined by media)
@@ -150,6 +154,7 @@ struct tipc_bearer {
u32 identity;
struct tipc_link_req *link_req;
char net_plane;
+ unsigned long up;
};
struct tipc_bearer_names {
@@ -180,6 +185,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info);
@@ -213,4 +219,13 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq);
+/* check if device MTU is too low for tipc headers */
+static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve)
+{
+ if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve)
+ return false;
+ netdev_warn(dev, "MTU too low for tipc bearer\n");
+ return true;
+}
+
#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 236b043a4156..0b982d048fb9 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -47,7 +47,7 @@
#include <linux/module.h>
/* configurable TIPC parameters */
-int tipc_net_id __read_mostly;
+unsigned int tipc_net_id __read_mostly;
int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */
static int __net_init tipc_init_net(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index a1845fb27d80..5cc5398be722 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -74,7 +74,7 @@ struct tipc_monitor;
#define MAX_BEARERS 3
#define TIPC_DEF_MON_THRESHOLD 32
-extern int tipc_net_id __read_mostly;
+extern unsigned int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 6b109a808d4c..02462d67d191 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -169,7 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
/* Send response, if necessary */
if (respond && (mtyp == DSC_REQ_MSG)) {
- rskb = tipc_buf_acquire(MAX_H_SIZE);
+ rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
if (!rskb)
return;
tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
@@ -278,7 +278,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
req = kmalloc(sizeof(*req), GFP_ATOMIC);
if (!req)
return -ENOMEM;
- req->buf = tipc_buf_acquire(MAX_H_SIZE);
+ req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
if (!req->buf) {
kfree(req);
return -ENOMEM;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 877d94f34814..4e8647aef01c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -47,8 +47,8 @@
#include <linux/pkt_sched.h>
struct tipc_stats {
- u32 sent_info; /* used in counting # sent packets */
- u32 recv_info; /* used in counting # recv'd packets */
+ u32 sent_pkts;
+ u32 recv_pkts;
u32 sent_states;
u32 recv_states;
u32 sent_probes;
@@ -181,7 +181,10 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
- int nack_state;
+ unsigned long prev_retr;
+ u16 prev_from;
+ u16 prev_to;
+ u8 nack_state;
bool bc_peer_is_up;
/* Statistics */
@@ -202,6 +205,8 @@ enum {
BC_NACK_SND_SUPPRESS,
};
+#define TIPC_BC_RETR_LIMIT 10 /* [ms] */
+
/*
* Interval between NACKs when packets arrive out of order
*/
@@ -237,8 +242,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
u16 rcvgap, int tolerance, int priority,
struct sk_buff_head *xmitq);
static void link_print(struct tipc_link *l, const char *str);
-static void tipc_link_build_nack_msg(struct tipc_link *l,
- struct sk_buff_head *xmitq);
+static int tipc_link_build_nack_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
static void tipc_link_build_bc_init_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
@@ -367,6 +372,18 @@ int tipc_link_bc_peers(struct tipc_link *l)
return l->ackers;
}
+u16 link_bc_rcv_gap(struct tipc_link *l)
+{
+ struct sk_buff *skb = skb_peek(&l->deferdq);
+ u16 gap = 0;
+
+ if (more(l->snd_nxt, l->rcv_nxt))
+ gap = l->snd_nxt - l->rcv_nxt;
+ if (skb)
+ gap = buf_seqno(skb) - l->rcv_nxt;
+ return gap;
+}
+
void tipc_link_set_mtu(struct tipc_link *l, int mtu)
{
l->mtu = mtu;
@@ -807,7 +824,7 @@ void link_prepare_wakeup(struct tipc_link *l)
skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
imp = TIPC_SKB_CB(skb)->chain_imp;
- lim = l->window + l->backlog[imp].limit;
+ lim = l->backlog[imp].limit;
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
if ((pnd[imp] + l->backlog[imp].len) >= lim)
break;
@@ -840,7 +857,6 @@ void tipc_link_reset(struct tipc_link *l)
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
@@ -871,17 +887,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *transmq = &l->transmq;
struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff *skb, *_skb, *bskb;
+ int pkt_cnt = skb_queue_len(list);
/* Match msg importance against this and all higher backlog limits: */
- for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
- if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
- return link_schedule_user(l, list);
+ if (!skb_queue_empty(backlogq)) {
+ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
+ if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
+ return link_schedule_user(l, list);
+ }
}
if (unlikely(msg_size(hdr) > mtu)) {
skb_queue_purge(list);
return -EMSGSIZE;
}
+ if (pkt_cnt > 1) {
+ l->stats.sent_fragmented++;
+ l->stats.sent_fragments += pkt_cnt;
+ }
+
/* Prepare each packet for sending, and add to relevant queue: */
while (skb_queue_len(list)) {
skb = skb_peek(list);
@@ -901,6 +925,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
l->rcv_unacked = 0;
+ l->stats.sent_pkts++;
seqno++;
continue;
}
@@ -949,6 +974,7 @@ void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
msg_set_ack(hdr, ack);
msg_set_bcast_ack(hdr, bc_ack);
l->rcv_unacked = 0;
+ l->stats.sent_pkts++;
seqno++;
}
l->snd_nxt = seqno;
@@ -1133,7 +1159,10 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
return 0;
l->rcv_unacked = 0;
- return TIPC_LINK_SND_BC_ACK;
+
+ /* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */
+ l->snd_nxt = l->rcv_nxt;
+ return TIPC_LINK_SND_STATE;
}
/* Unicast ACK */
@@ -1162,17 +1191,26 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
}
/* tipc_link_build_nack_msg: prepare link nack message for transmission
+ * Note that sending of broadcast NACK is coordinated among nodes, to
+ * reduce the risk of NACK storms towards the sender
*/
-static void tipc_link_build_nack_msg(struct tipc_link *l,
- struct sk_buff_head *xmitq)
+static int tipc_link_build_nack_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
{
u32 def_cnt = ++l->stats.deferred_recv;
+ int match1, match2;
- if (link_is_bc_rcvlink(l))
- return;
+ if (link_is_bc_rcvlink(l)) {
+ match1 = def_cnt & 0xf;
+ match2 = tipc_own_addr(l->net) & 0xf;
+ if (match1 == match2)
+ return TIPC_LINK_SND_STATE;
+ return 0;
+ }
if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV))
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq);
+ return 0;
}
/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
@@ -1223,18 +1261,18 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Defer delivery if sequence gap */
if (unlikely(seqno != rcv_nxt)) {
__tipc_skb_queue_sorted(defq, seqno, skb);
- tipc_link_build_nack_msg(l, xmitq);
+ rc |= tipc_link_build_nack_msg(l, xmitq);
break;
}
/* Deliver packet */
l->rcv_nxt++;
- l->stats.recv_info++;
+ l->stats.recv_pkts++;
if (!tipc_data_input(l, skb, l->inputq))
rc |= tipc_link_input(l, skb, l->inputq);
if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
rc |= tipc_link_build_state_msg(l, xmitq);
- if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK))
+ if (unlikely(rc & ~TIPC_LINK_SND_STATE))
break;
} while ((skb = __skb_dequeue(defq)));
@@ -1248,10 +1286,11 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
u16 rcvgap, int tolerance, int priority,
struct sk_buff_head *xmitq)
{
+ struct tipc_link *bcl = l->bc_rcvlink;
struct sk_buff *skb;
struct tipc_msg *hdr;
struct sk_buff_head *dfq = &l->deferdq;
- bool node_up = link_is_up(l->bc_rcvlink);
+ bool node_up = link_is_up(bcl);
struct tipc_mon_state *mstate = &l->mon_state;
int dlen = 0;
void *data;
@@ -1279,7 +1318,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_net_plane(hdr, l->net_plane);
msg_set_next_sent(hdr, l->snd_nxt);
msg_set_ack(hdr, l->rcv_nxt - 1);
- msg_set_bcast_ack(hdr, l->bc_rcvlink->rcv_nxt - 1);
+ msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1);
+ msg_set_bc_ack_invalid(hdr, !node_up);
msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
msg_set_link_tolerance(hdr, tolerance);
msg_set_linkprio(hdr, priority);
@@ -1289,6 +1329,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
if (mtyp == STATE_MSG) {
msg_set_seq_gap(hdr, rcvgap);
+ msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
msg_set_probe(hdr, probe);
tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id);
msg_set_size(hdr, INT_H_SIZE + dlen);
@@ -1354,7 +1395,7 @@ tnl:
msg_set_seqno(hdr, seqno++);
pktlen = msg_size(hdr);
msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
- tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE);
+ tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
if (!tnlskb) {
pr_warn("%sunable to send packet\n", link_co_err);
return;
@@ -1458,8 +1499,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
l->tolerance = peers_tol;
- if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI,
- TIPC_MAX_LINK_PRI)) {
+ /* Update own prio if peer indicates a different value */
+ if ((peers_prio != l->priority) &&
+ in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) {
l->priority = peers_prio;
rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
@@ -1541,6 +1583,7 @@ static void tipc_link_build_bc_init_msg(struct tipc_link *l,
__skb_queue_head_init(&list);
if (!tipc_link_build_bc_proto_msg(l->bc_rcvlink, false, 0, &list))
return;
+ msg_set_bc_ack_invalid(buf_msg(skb_peek(&list)), true);
tipc_link_xmit(l, &list, xmitq);
}
@@ -1571,51 +1614,107 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
}
+/* link_bc_retr eval()- check if the indicated range can be retransmitted now
+ * - Adjust permitted range if there is overlap with previous retransmission
+ */
+static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
+{
+ unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
+
+ if (less(*to, *from))
+ return false;
+
+ /* New retransmission request */
+ if ((elapsed > TIPC_BC_RETR_LIMIT) ||
+ less(*to, l->prev_from) || more(*from, l->prev_to)) {
+ l->prev_from = *from;
+ l->prev_to = *to;
+ l->prev_retr = jiffies;
+ return true;
+ }
+
+ /* Inside range of previous retransmit */
+ if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
+ return false;
+
+ /* Fully or partially outside previous range => exclude overlap */
+ if (less(*from, l->prev_from)) {
+ *to = l->prev_from - 1;
+ l->prev_from = *from;
+ }
+ if (more(*to, l->prev_to)) {
+ *from = l->prev_to + 1;
+ l->prev_to = *to;
+ }
+ l->prev_retr = jiffies;
+ return true;
+}
+
/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
*/
-void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
- struct sk_buff_head *xmitq)
+int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
+ struct sk_buff_head *xmitq)
{
+ struct tipc_link *snd_l = l->bc_sndlink;
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
+ u16 from = msg_bcast_ack(hdr) + 1;
+ u16 to = from + msg_bc_gap(hdr) - 1;
+ int rc = 0;
if (!link_is_up(l))
- return;
+ return rc;
if (!msg_peer_node_is_up(hdr))
- return;
+ return rc;
/* Open when peer ackowledges our bcast init msg (pkt #1) */
if (msg_ack(hdr))
l->bc_peer_is_up = true;
if (!l->bc_peer_is_up)
- return;
+ return rc;
+
+ l->stats.recv_nacks++;
/* Ignore if peers_snd_nxt goes beyond receive window */
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
- return;
+ return rc;
+
+ if (link_bc_retr_eval(snd_l, &from, &to))
+ rc = tipc_link_retrans(snd_l, from, to, xmitq);
+
+ l->snd_nxt = peers_snd_nxt;
+ if (link_bc_rcv_gap(l))
+ rc |= TIPC_LINK_SND_STATE;
+
+ /* Return now if sender supports nack via STATE messages */
+ if (l->peer_caps & TIPC_BCAST_STATE_NACK)
+ return rc;
+
+ /* Otherwise, be backwards compatible */
if (!more(peers_snd_nxt, l->rcv_nxt)) {
l->nack_state = BC_NACK_SND_CONDITIONAL;
- return;
+ return 0;
}
/* Don't NACK if one was recently sent or peeked */
if (l->nack_state == BC_NACK_SND_SUPPRESS) {
l->nack_state = BC_NACK_SND_UNCONDITIONAL;
- return;
+ return 0;
}
/* Conditionally delay NACK sending until next synch rcv */
if (l->nack_state == BC_NACK_SND_CONDITIONAL) {
l->nack_state = BC_NACK_SND_UNCONDITIONAL;
if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN)
- return;
+ return 0;
}
/* Send NACK now but suppress next one */
tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq);
l->nack_state = BC_NACK_SND_SUPPRESS;
+ return 0;
}
void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
@@ -1652,6 +1751,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
}
/* tipc_link_bc_nack_rcv(): receive broadcast nack message
+ * This function is here for backwards compatibility, since
+ * no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5.
*/
int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *xmitq)
@@ -1692,10 +1793,10 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE);
l->window = win;
- l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2;
- l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win;
- l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3;
- l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2;
+ l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win);
+ l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2);
+ l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3);
+ l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4);
l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
}
@@ -1706,10 +1807,6 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
void tipc_link_reset_stats(struct tipc_link *l)
{
memset(&l->stats, 0, sizeof(l->stats));
- if (!link_is_bc_sndlink(l)) {
- l->stats.sent_info = l->snd_nxt;
- l->stats.recv_info = l->rcv_nxt;
- }
}
static void link_print(struct tipc_link *l, const char *str)
@@ -1773,12 +1870,12 @@ static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s)
};
struct nla_map map[] = {
- {TIPC_NLA_STATS_RX_INFO, s->recv_info},
+ {TIPC_NLA_STATS_RX_INFO, 0},
{TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments},
{TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented},
{TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles},
{TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled},
- {TIPC_NLA_STATS_TX_INFO, s->sent_info},
+ {TIPC_NLA_STATS_TX_INFO, 0},
{TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments},
{TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented},
{TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles},
@@ -1853,9 +1950,9 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
goto attr_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts))
goto attr_msg_full;
if (tipc_link_is_up(link))
@@ -1910,12 +2007,12 @@ static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
};
struct nla_map map[] = {
- {TIPC_NLA_STATS_RX_INFO, stats->recv_info},
+ {TIPC_NLA_STATS_RX_INFO, stats->recv_pkts},
{TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
{TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
{TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
{TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
- {TIPC_NLA_STATS_TX_INFO, stats->sent_info},
+ {TIPC_NLA_STATS_TX_INFO, stats->sent_pkts},
{TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
{TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
{TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
@@ -1982,9 +2079,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
goto attr_msg_full;
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0))
goto attr_msg_full;
prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
diff --git a/net/tipc/link.h b/net/tipc/link.h
index d7e9d42fcb2d..d1bd1787a768 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -63,7 +63,7 @@ enum {
enum {
TIPC_LINK_UP_EVT = 1,
TIPC_LINK_DOWN_EVT = (1 << 1),
- TIPC_LINK_SND_BC_ACK = (1 << 2)
+ TIPC_LINK_SND_STATE = (1 << 2)
};
/* Starting value for maximum packet size negotiation on unicast links
@@ -138,8 +138,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
void tipc_link_build_bc_sync_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr);
-void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
- struct sk_buff_head *xmitq);
+int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
+ struct sk_buff_head *xmitq);
int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *xmitq);
#endif
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index ed97a5876ebe..9e109bb1a207 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -455,14 +455,14 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
int i, applied_bef;
state->probing = false;
- if (!dlen)
- return;
/* Sanity check received domain record */
- if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) {
- pr_warn_ratelimited("Received illegal domain record\n");
+ if (dlen < dom_rec_len(arrv_dom, 0))
+ return;
+ if (dlen != dom_rec_len(arrv_dom, new_member_cnt))
+ return;
+ if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen)
return;
- }
/* Synch generation numbers with peer if link just came up */
if (!state->synched) {
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 17201aa8423d..ab02d0742476 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -58,12 +58,12 @@ static unsigned int align(unsigned int i)
* NOTE: Headroom is reserved to allow prepending of a data link header.
* There may also be unrequested tailroom present at the buffer's end.
*/
-struct sk_buff *tipc_buf_acquire(u32 size)
+struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
struct sk_buff *skb;
unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
- skb = alloc_skb_fclone(buf_size, GFP_ATOMIC);
+ skb = alloc_skb_fclone(buf_size, gfp);
if (skb) {
skb_reserve(skb, BUF_HEADROOM);
skb_put(skb, size);
@@ -95,7 +95,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type,
struct tipc_msg *msg;
struct sk_buff *buf;
- buf = tipc_buf_acquire(hdr_sz + data_sz);
+ buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC);
if (unlikely(!buf))
return NULL;
@@ -261,14 +261,14 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
/* No fragmentation needed? */
if (likely(msz <= pktmax)) {
- skb = tipc_buf_acquire(msz);
+ skb = tipc_buf_acquire(msz, GFP_KERNEL);
if (unlikely(!skb))
return -ENOMEM;
skb_orphan(skb);
__skb_queue_tail(list, skb);
skb_copy_to_linear_data(skb, mhdr, mhsz);
pktpos = skb->data + mhsz;
- if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz)
+ if (copy_from_iter_full(pktpos, dsz, &m->msg_iter))
return dsz;
rc = -EFAULT;
goto error;
@@ -282,7 +282,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
msg_set_importance(&pkthdr, msg_importance(mhdr));
/* Prepare first fragment */
- skb = tipc_buf_acquire(pktmax);
+ skb = tipc_buf_acquire(pktmax, GFP_KERNEL);
if (!skb)
return -ENOMEM;
skb_orphan(skb);
@@ -299,7 +299,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
if (drem < pktrem)
pktrem = drem;
- if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) {
+ if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) {
rc = -EFAULT;
goto error;
}
@@ -313,7 +313,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
pktsz = drem + INT_H_SIZE;
else
pktsz = pktmax;
- skb = tipc_buf_acquire(pktsz);
+ skb = tipc_buf_acquire(pktsz, GFP_KERNEL);
if (!skb) {
rc = -ENOMEM;
goto error;
@@ -448,7 +448,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
if (msz > (max / 2))
return false;
- _skb = tipc_buf_acquire(max);
+ _skb = tipc_buf_acquire(max, GFP_ATOMIC);
if (!_skb)
return false;
@@ -496,7 +496,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
/* Never return SHORT header; expand by replacing buffer if necessary */
if (msg_short(hdr)) {
- *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen);
+ *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC);
if (!*skb)
goto exit;
memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 7cf52fb39bee..2c3dc38abf9c 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -95,7 +95,7 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
struct tipc_skb_cb {
- void *handle;
+ u32 bytes_read;
struct sk_buff *tail;
bool validated;
bool wakeup_pending;
@@ -714,11 +714,38 @@ static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s)
msg_set_bits(m, 5, 13, 0x1, s);
}
+static inline bool msg_bc_ack_invalid(struct tipc_msg *m)
+{
+ switch (msg_user(m)) {
+ case BCAST_PROTOCOL:
+ case NAME_DISTRIBUTOR:
+ case LINK_PROTOCOL:
+ return msg_bits(m, 5, 14, 0x1);
+ default:
+ return false;
+ }
+}
+
+static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid)
+{
+ msg_set_bits(m, 5, 14, 0x1, invalid);
+}
+
static inline char *msg_media_addr(struct tipc_msg *m)
{
return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET];
}
+static inline u32 msg_bc_gap(struct tipc_msg *m)
+{
+ return msg_bits(m, 8, 0, 0x3ff);
+}
+
+static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 8, 0, 0x3ff, n);
+}
+
/*
* Word 9
*/
@@ -793,7 +820,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
}
-struct sk_buff *tipc_buf_acquire(u32 size);
+struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
bool tipc_msg_validate(struct sk_buff *skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index a04fe9be1c60..23f8899e0f8c 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -69,7 +69,7 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
u32 dest)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size);
+ struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
struct tipc_msg *msg;
if (buf != NULL) {
@@ -156,6 +156,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
pr_warn("Bulk publication failure\n");
return;
}
+ msg_set_bc_ack_invalid(buf_msg(skb), true);
item = (struct distr_item *)msg_data(buf_msg(skb));
}
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 77a7a118911d..c7c254902873 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -39,6 +39,8 @@
#include <net/genetlink.h>
+extern const struct nla_policy tipc_nl_net_policy[];
+
int tipc_net_start(struct net *net, u32 addr);
void tipc_net_stop(struct net *net);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index a84daec0afe9..26ca8dd64ded 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -41,6 +41,7 @@
#include "link.h"
#include "node.h"
#include "net.h"
+#include "udp_media.h"
#include <net/genetlink.h>
static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
@@ -134,15 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
/* Users of the legacy API (tipc-config) can't handle that we add operations,
* so we have a separate genl handling for the new API.
*/
-struct genl_family tipc_genl_family = {
- .id = GENL_ID_GENERATE,
- .name = TIPC_GENL_V2_NAME,
- .version = TIPC_GENL_V2_VERSION,
- .hdrsize = 0,
- .maxattr = TIPC_NLA_MAX,
- .netnsok = true,
-};
-
static const struct genl_ops tipc_genl_v2_ops[] = {
{
.cmd = TIPC_NL_BEARER_DISABLE,
@@ -161,6 +153,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
.policy = tipc_nl_policy,
},
{
+ .cmd = TIPC_NL_BEARER_ADD,
+ .doit = tipc_nl_bearer_add,
+ .policy = tipc_nl_policy,
+ },
+ {
.cmd = TIPC_NL_BEARER_SET,
.doit = tipc_nl_bearer_set,
.policy = tipc_nl_policy,
@@ -238,25 +235,47 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
.dumpit = tipc_nl_node_dump_monitor_peer,
.policy = tipc_nl_policy,
},
+ {
+ .cmd = TIPC_NL_PEER_REMOVE,
+ .doit = tipc_nl_peer_rm,
+ .policy = tipc_nl_policy,
+ },
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ {
+ .cmd = TIPC_NL_UDP_GET_REMOTEIP,
+ .dumpit = tipc_udp_nl_dump_remoteip,
+ .policy = tipc_nl_policy,
+ },
+#endif
+};
+
+struct genl_family tipc_genl_family __ro_after_init = {
+ .name = TIPC_GENL_V2_NAME,
+ .version = TIPC_GENL_V2_VERSION,
+ .hdrsize = 0,
+ .maxattr = TIPC_NLA_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tipc_genl_v2_ops,
+ .n_ops = ARRAY_SIZE(tipc_genl_v2_ops),
};
int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
{
u32 maxattr = tipc_genl_family.maxattr;
- *attr = tipc_genl_family.attrbuf;
+ *attr = genl_family_attrbuf(&tipc_genl_family);
if (!*attr)
return -EOPNOTSUPP;
return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy);
}
-int tipc_netlink_start(void)
+int __init tipc_netlink_start(void)
{
int res;
- res = genl_register_family_with_ops(&tipc_genl_family,
- tipc_genl_v2_ops);
+ res = genl_register_family(&tipc_genl_family);
if (res) {
pr_err("Failed to register netlink interface\n");
return res;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1fd464764765..e1ae8a8a2b8e 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1215,15 +1215,6 @@ send:
return err;
}
-static struct genl_family tipc_genl_compat_family = {
- .id = GENL_ID_GENERATE,
- .name = TIPC_GENL_NAME,
- .version = TIPC_GENL_VERSION,
- .hdrsize = TIPC_GENL_HDRLEN,
- .maxattr = 0,
- .netnsok = true,
-};
-
static struct genl_ops tipc_genl_compat_ops[] = {
{
.cmd = TIPC_GENL_CMD,
@@ -1231,12 +1222,22 @@ static struct genl_ops tipc_genl_compat_ops[] = {
},
};
-int tipc_netlink_compat_start(void)
+static struct genl_family tipc_genl_compat_family __ro_after_init = {
+ .name = TIPC_GENL_NAME,
+ .version = TIPC_GENL_VERSION,
+ .hdrsize = TIPC_GENL_HDRLEN,
+ .maxattr = 0,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tipc_genl_compat_ops,
+ .n_ops = ARRAY_SIZE(tipc_genl_compat_ops),
+};
+
+int __init tipc_netlink_compat_start(void)
{
int res;
- res = genl_register_family_with_ops(&tipc_genl_compat_family,
- tipc_genl_compat_ops);
+ res = genl_register_family(&tipc_genl_compat_family);
if (res) {
pr_err("Failed to register legacy compat interface\n");
return res;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 21974191e425..27753325e06e 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -263,6 +263,11 @@ static void tipc_node_write_lock(struct tipc_node *n)
write_lock_bh(&n->lock);
}
+static void tipc_node_write_unlock_fast(struct tipc_node *n)
+{
+ write_unlock_bh(&n->lock);
+}
+
static void tipc_node_write_unlock(struct tipc_node *n)
{
struct net *net = n->net;
@@ -417,7 +422,7 @@ void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr)
}
tipc_node_write_lock(n);
list_add_tail(subscr, &n->publ_list);
- tipc_node_write_unlock(n);
+ tipc_node_write_unlock_fast(n);
tipc_node_put(n);
}
@@ -435,7 +440,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr)
}
tipc_node_write_lock(n);
list_del_init(subscr);
- tipc_node_write_unlock(n);
+ tipc_node_write_unlock_fast(n);
tipc_node_put(n);
}
@@ -1262,6 +1267,34 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
kfree_skb(skb);
}
+static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
+ int bearer_id, struct sk_buff_head *xmitq)
+{
+ struct tipc_link *ucl;
+ int rc;
+
+ rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr);
+
+ if (rc & TIPC_LINK_DOWN_EVT) {
+ tipc_bearer_reset_all(n->net);
+ return;
+ }
+
+ if (!(rc & TIPC_LINK_SND_STATE))
+ return;
+
+ /* If probe message, a STATE response will be sent anyway */
+ if (msg_probe(hdr))
+ return;
+
+ /* Produce a STATE message carrying broadcast NACK */
+ tipc_node_read_lock(n);
+ ucl = n->links[bearer_id].link;
+ if (ucl)
+ tipc_link_build_state_msg(ucl, xmitq);
+ tipc_node_read_unlock(n);
+}
+
/**
* tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
* @net: the applicable net namespace
@@ -1298,7 +1331,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
rc = tipc_bcast_rcv(net, be->link, skb);
/* Broadcast ACKs are sent on a unicast link */
- if (rc & TIPC_LINK_SND_BC_ACK) {
+ if (rc & TIPC_LINK_SND_STATE) {
tipc_node_read_lock(n);
tipc_link_build_state_msg(le->link, &xmitq);
tipc_node_read_unlock(n);
@@ -1505,9 +1538,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
/* Ensure broadcast reception is in synch with peer's send state */
if (unlikely(usr == LINK_PROTOCOL))
- tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr);
+ tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq);
else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack))
- tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack);
+ tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr);
/* Receive packet directly if conditions permit */
tipc_node_read_lock(n);
@@ -1553,6 +1586,69 @@ discard:
kfree_skb(skb);
}
+int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
+ struct tipc_node *peer;
+ u32 addr;
+ int err;
+ int i;
+
+ /* We identify the peer by its net */
+ if (!info->attrs[TIPC_NLA_NET])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX,
+ info->attrs[TIPC_NLA_NET],
+ tipc_nl_net_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_NET_ADDR])
+ return -EINVAL;
+
+ addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
+
+ if (in_own_node(net, addr))
+ return -ENOTSUPP;
+
+ spin_lock_bh(&tn->node_list_lock);
+ peer = tipc_node_find(net, addr);
+ if (!peer) {
+ spin_unlock_bh(&tn->node_list_lock);
+ return -ENXIO;
+ }
+
+ tipc_node_write_lock(peer);
+ if (peer->state != SELF_DOWN_PEER_DOWN &&
+ peer->state != SELF_DOWN_PEER_LEAVING) {
+ tipc_node_write_unlock(peer);
+ err = -EBUSY;
+ goto err_out;
+ }
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ struct tipc_link_entry *le = &peer->links[i];
+
+ if (le->link) {
+ kfree(le->link);
+ le->link = NULL;
+ peer->link_cnt--;
+ }
+ }
+ tipc_node_write_unlock(peer);
+ tipc_node_delete(peer);
+
+ err = 0;
+err_out:
+ tipc_node_put(peer);
+ spin_unlock_bh(&tn->node_list_lock);
+
+ return err;
+}
+
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int err;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index d69fdfcc0ec9..39ef54c1f2ad 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -1,7 +1,7 @@
/*
* net/tipc/node.h: Include file for TIPC node management routines
*
- * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2000-2006, 2014-2016, Ericsson AB
* Copyright (c) 2005, 2010-2014, Wind River Systems
* All rights reserved.
*
@@ -45,11 +45,14 @@
/* Optional capabilities supported by this code version
*/
enum {
- TIPC_BCAST_SYNCH = (1 << 1),
- TIPC_BLOCK_FLOWCTL = (2 << 1)
+ TIPC_BCAST_SYNCH = (1 << 1),
+ TIPC_BCAST_STATE_NACK = (1 << 2),
+ TIPC_BLOCK_FLOWCTL = (1 << 3)
};
-#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL)
+#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
+ TIPC_BCAST_STATE_NACK | \
+ TIPC_BLOCK_FLOWCTL)
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
@@ -77,6 +80,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 215849ce453d..3cd6402e812c 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -86,12 +86,12 @@ struct outqueue_entry {
static void tipc_recv_work(struct work_struct *work);
static void tipc_send_work(struct work_struct *work);
static void tipc_clean_outqueues(struct tipc_conn *con);
-static void tipc_sock_release(struct tipc_conn *con);
static void tipc_conn_kref_release(struct kref *kref)
{
struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
- struct sockaddr_tipc *saddr = con->server->saddr;
+ struct tipc_server *s = con->server;
+ struct sockaddr_tipc *saddr = s->saddr;
struct socket *sock = con->sock;
struct sock *sk;
@@ -103,9 +103,13 @@ static void tipc_conn_kref_release(struct kref *kref)
}
saddr->scope = -TIPC_NODE_SCOPE;
kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
- tipc_sock_release(con);
sock_release(sock);
con->sock = NULL;
+
+ spin_lock_bh(&s->idr_lock);
+ idr_remove(&s->conn_idr, con->conid);
+ s->idr_in_use--;
+ spin_unlock_bh(&s->idr_lock);
}
tipc_clean_outqueues(con);
@@ -128,8 +132,10 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
spin_lock_bh(&s->idr_lock);
con = idr_find(&s->conn_idr, conid);
- if (con)
+ if (con && test_bit(CF_CONNECTED, &con->flags))
conn_get(con);
+ else
+ con = NULL;
spin_unlock_bh(&s->idr_lock);
return con;
}
@@ -186,26 +192,15 @@ static void tipc_unregister_callbacks(struct tipc_conn *con)
write_unlock_bh(&sk->sk_callback_lock);
}
-static void tipc_sock_release(struct tipc_conn *con)
-{
- struct tipc_server *s = con->server;
-
- if (con->conid)
- s->tipc_conn_release(con->conid, con->usr_data);
-
- tipc_unregister_callbacks(con);
-}
-
static void tipc_close_conn(struct tipc_conn *con)
{
struct tipc_server *s = con->server;
if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
+ tipc_unregister_callbacks(con);
- spin_lock_bh(&s->idr_lock);
- idr_remove(&s->conn_idr, con->conid);
- s->idr_in_use--;
- spin_unlock_bh(&s->idr_lock);
+ if (con->conid)
+ s->tipc_conn_release(con->conid, con->usr_data);
/* We shouldn't flush pending works as we may be in the
* thread. In fact the races with pending rx/tx work structs
@@ -458,6 +453,11 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
if (!con)
return -EINVAL;
+ if (!test_bit(CF_CONNECTED, &con->flags)) {
+ conn_put(con);
+ return 0;
+ }
+
e = tipc_alloc_entry(data, len);
if (!e) {
conn_put(con);
@@ -471,12 +471,8 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
list_add_tail(&e->list, &con->outqueue);
spin_unlock_bh(&con->outqueue_lock);
- if (test_bit(CF_CONNECTED, &con->flags)) {
- if (!queue_work(s->send_wq, &con->swork))
- conn_put(con);
- } else {
+ if (!queue_work(s->send_wq, &con->swork))
conn_put(con);
- }
return 0;
}
@@ -500,7 +496,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
int ret;
spin_lock_bh(&con->outqueue_lock);
- while (1) {
+ while (test_bit(CF_CONNECTED, &con->flags)) {
e = list_entry(con->outqueue.next, struct outqueue_entry,
list);
if ((struct list_head *) e == &con->outqueue)
@@ -623,14 +619,12 @@ int tipc_server_start(struct tipc_server *s)
void tipc_server_stop(struct tipc_server *s)
{
struct tipc_conn *con;
- int total = 0;
int id;
spin_lock_bh(&s->idr_lock);
- for (id = 0; total < s->idr_in_use; id++) {
+ for (id = 0; s->idr_in_use; id++) {
con = idr_find(&s->conn_idr, id);
if (con) {
- total++;
spin_unlock_bh(&s->idr_lock);
tipc_close_conn(con);
spin_lock_bh(&s->idr_lock);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f9f5f3c3dab5..800caaa699a1 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1,7 +1,7 @@
/*
* net/tipc/socket.c: TIPC socket API
*
- * Copyright (c) 2001-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 2001-2007, 2012-2016, Ericsson AB
* Copyright (c) 2004-2008, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -44,44 +44,43 @@
#include "bcast.h"
#include "netlink.h"
-#define SS_LISTENING -1 /* socket is listening */
-#define SS_READY -2 /* socket is connectionless */
-
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */
#define TIPC_FWD_MSG 1
-#define TIPC_CONN_OK 0
-#define TIPC_CONN_PROBING 1
#define TIPC_MAX_PORT 0xffffffff
#define TIPC_MIN_PORT 1
+enum {
+ TIPC_LISTEN = TCP_LISTEN,
+ TIPC_ESTABLISHED = TCP_ESTABLISHED,
+ TIPC_OPEN = TCP_CLOSE,
+ TIPC_DISCONNECTING = TCP_CLOSE_WAIT,
+ TIPC_CONNECTING = TCP_SYN_SENT,
+};
+
/**
* struct tipc_sock - TIPC socket structure
* @sk: socket - interacts with 'port' and with user via the socket API
- * @connected: non-zero if port is currently connected to a peer port
* @conn_type: TIPC type used when connection was established
* @conn_instance: TIPC instance used when connection was established
* @published: non-zero if port has one or more associated names
* @max_pkt: maximum packet size "hint" used when building messages sent by port
* @portid: unique port identity in TIPC socket hash table
* @phdr: preformatted message header used when sending messages
- * @port_list: adjacent ports in TIPC's global list of ports
* @publications: list of publications for port
* @pub_count: total # of publications port has made during its lifetime
* @probing_state:
- * @probing_intv:
* @conn_timeout: the time we can wait for an unresponded setup request
* @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
* @link_cong: non-zero if owner must sleep because of link congestion
* @sent_unacked: # messages sent by socket, and not yet acked by peer
* @rcv_unacked: # messages read by user, but not yet acked back to peer
- * @remote: 'connected' peer for dgram/rdm
+ * @peer: 'connected' peer for dgram/rdm
* @node: hash table node
* @rcu: rcu struct for tipc_sock
*/
struct tipc_sock {
struct sock sk;
- int connected;
u32 conn_type;
u32 conn_instance;
int published;
@@ -91,17 +90,16 @@ struct tipc_sock {
struct list_head sock_list;
struct list_head publications;
u32 pub_count;
- u32 probing_state;
- unsigned long probing_intv;
uint conn_timeout;
atomic_t dupl_rcvcnt;
+ bool probe_unacked;
bool link_cong;
u16 snt_unacked;
u16 snd_win;
u16 peer_caps;
u16 rcv_unacked;
u16 rcv_win;
- struct sockaddr_tipc remote;
+ struct sockaddr_tipc peer;
struct rhash_head node;
struct rcu_head rcu;
};
@@ -129,54 +127,8 @@ static const struct proto_ops packet_ops;
static const struct proto_ops stream_ops;
static const struct proto_ops msg_ops;
static struct proto tipc_proto;
-
static const struct rhashtable_params tsk_rht_params;
-/*
- * Revised TIPC socket locking policy:
- *
- * Most socket operations take the standard socket lock when they start
- * and hold it until they finish (or until they need to sleep). Acquiring
- * this lock grants the owner exclusive access to the fields of the socket
- * data structures, with the exception of the backlog queue. A few socket
- * operations can be done without taking the socket lock because they only
- * read socket information that never changes during the life of the socket.
- *
- * Socket operations may acquire the lock for the associated TIPC port if they
- * need to perform an operation on the port. If any routine needs to acquire
- * both the socket lock and the port lock it must take the socket lock first
- * to avoid the risk of deadlock.
- *
- * The dispatcher handling incoming messages cannot grab the socket lock in
- * the standard fashion, since invoked it runs at the BH level and cannot block.
- * Instead, it checks to see if the socket lock is currently owned by someone,
- * and either handles the message itself or adds it to the socket's backlog
- * queue; in the latter case the queued message is processed once the process
- * owning the socket lock releases it.
- *
- * NOTE: Releasing the socket lock while an operation is sleeping overcomes
- * the problem of a blocked socket operation preventing any other operations
- * from occurring. However, applications must be careful if they have
- * multiple threads trying to send (or receive) on the same socket, as these
- * operations might interfere with each other. For example, doing a connect
- * and a receive at the same time might allow the receive to consume the
- * ACK message meant for the connect. While additional work could be done
- * to try and overcome this, it doesn't seem to be worthwhile at the present.
- *
- * NOTE: Releasing the socket lock while an operation is sleeping also ensures
- * that another operation that must be performed in a non-blocking manner is
- * not delayed for very long because the lock has already been taken.
- *
- * NOTE: This code assumes that certain fields of a port/socket pair are
- * constant over its lifetime; such fields can be examined without taking
- * the socket lock and/or port lock, and do not need to be re-read even
- * after resuming processing after waiting. These fields include:
- * - socket type
- * - pointer to socket sk structure (aka tipc_sock structure)
- * - pointer to port structure
- * - port reference
- */
-
static u32 tsk_own_node(struct tipc_sock *tsk)
{
return msg_prevnode(&tsk->phdr);
@@ -232,7 +184,7 @@ static struct tipc_sock *tipc_sk(const struct sock *sk)
static bool tsk_conn_cong(struct tipc_sock *tsk)
{
- return tsk->snt_unacked >= tsk->snd_win;
+ return tsk->snt_unacked > tsk->snd_win;
}
/* tsk_blocks(): translate a buffer size in bytes to number of
@@ -294,6 +246,21 @@ static void tsk_rej_rx_queue(struct sock *sk)
tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
+static bool tipc_sk_connected(struct sock *sk)
+{
+ return sk->sk_state == TIPC_ESTABLISHED;
+}
+
+/* tipc_sk_type_connectionless - check if the socket is datagram socket
+ * @sk: socket
+ *
+ * Returns true if connection less, false otherwise
+ */
+static bool tipc_sk_type_connectionless(struct sock *sk)
+{
+ return sk->sk_type == SOCK_RDM || sk->sk_type == SOCK_DGRAM;
+}
+
/* tsk_peer_msg - verify if message was sent by connected port's peer
*
* Handles cases where the node's network address has changed from
@@ -301,12 +268,13 @@ static void tsk_rej_rx_queue(struct sock *sk)
*/
static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
{
- struct tipc_net *tn = net_generic(sock_net(&tsk->sk), tipc_net_id);
+ struct sock *sk = &tsk->sk;
+ struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
u32 peer_port = tsk_peer_port(tsk);
u32 orig_node;
u32 peer_node;
- if (unlikely(!tsk->connected))
+ if (unlikely(!tipc_sk_connected(sk)))
return false;
if (unlikely(msg_origport(msg) != peer_port))
@@ -327,6 +295,45 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
return false;
}
+/* tipc_set_sk_state - set the sk_state of the socket
+ * @sk: socket
+ *
+ * Caller must hold socket lock
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_set_sk_state(struct sock *sk, int state)
+{
+ int oldsk_state = sk->sk_state;
+ int res = -EINVAL;
+
+ switch (state) {
+ case TIPC_OPEN:
+ res = 0;
+ break;
+ case TIPC_LISTEN:
+ case TIPC_CONNECTING:
+ if (oldsk_state == TIPC_OPEN)
+ res = 0;
+ break;
+ case TIPC_ESTABLISHED:
+ if (oldsk_state == TIPC_CONNECTING ||
+ oldsk_state == TIPC_OPEN)
+ res = 0;
+ break;
+ case TIPC_DISCONNECTING:
+ if (oldsk_state == TIPC_CONNECTING ||
+ oldsk_state == TIPC_ESTABLISHED)
+ res = 0;
+ break;
+ }
+
+ if (!res)
+ sk->sk_state = state;
+
+ return res;
+}
+
/**
* tipc_sk_create - create a TIPC socket
* @net: network namespace (must be default network)
@@ -344,7 +351,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
{
struct tipc_net *tn;
const struct proto_ops *ops;
- socket_state state;
struct sock *sk;
struct tipc_sock *tsk;
struct tipc_msg *msg;
@@ -356,16 +362,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
switch (sock->type) {
case SOCK_STREAM:
ops = &stream_ops;
- state = SS_UNCONNECTED;
break;
case SOCK_SEQPACKET:
ops = &packet_ops;
- state = SS_UNCONNECTED;
break;
case SOCK_DGRAM:
case SOCK_RDM:
ops = &msg_ops;
- state = SS_READY;
break;
default:
return -EPROTOTYPE;
@@ -386,14 +389,15 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
/* Finish initializing socket data structures */
sock->ops = ops;
- sock->state = state;
sock_init_data(sock, sk);
+ tipc_set_sk_state(sk, TIPC_OPEN);
if (tipc_sk_insert(tsk)) {
pr_warn("Socket create failed; port number exhausted\n");
return -EINVAL;
}
msg_set_origport(msg, tsk->portid);
setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
+ sk->sk_shutdown = 0;
sk->sk_backlog_rcv = tipc_backlog_rcv;
sk->sk_rcvbuf = sysctl_tipc_rmem[1];
sk->sk_data_ready = tipc_data_ready;
@@ -406,11 +410,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN);
tsk->rcv_win = tsk->snd_win;
- if (sock->state == SS_READY) {
+ if (tipc_sk_type_connectionless(sk)) {
tsk_set_unreturnable(tsk, true);
if (sock->type == SOCK_DGRAM)
tsk_set_unreliable(tsk, true);
}
+
return 0;
}
@@ -421,6 +426,46 @@ static void tipc_sk_callback(struct rcu_head *head)
sock_put(&tsk->sk);
}
+/* Caller should hold socket lock for the socket. */
+static void __tipc_shutdown(struct socket *sock, int error)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 dnode = tsk_peer_node(tsk);
+ struct sk_buff *skb;
+
+ /* Reject all unreceived messages, except on an active connection
+ * (which disconnects locally & sends a 'FIN+' to peer).
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (TIPC_SKB_CB(skb)->bytes_read) {
+ kfree_skb(skb);
+ continue;
+ }
+ if (!tipc_sk_type_connectionless(sk) &&
+ sk->sk_state != TIPC_DISCONNECTING) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ }
+ tipc_sk_respond(sk, skb, error);
+ }
+
+ if (tipc_sk_type_connectionless(sk))
+ return;
+
+ if (sk->sk_state != TIPC_DISCONNECTING) {
+ skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
+ TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
+ tsk_own_node(tsk), tsk_peer_port(tsk),
+ tsk->portid, error);
+ if (skb)
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ }
+}
+
/**
* tipc_release - destroy a TIPC socket
* @sock: socket to destroy
@@ -440,10 +485,7 @@ static void tipc_sk_callback(struct rcu_head *head)
static int tipc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
- struct net *net;
struct tipc_sock *tsk;
- struct sk_buff *skb;
- u32 dnode;
/*
* Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -452,47 +494,16 @@ static int tipc_release(struct socket *sock)
if (sk == NULL)
return 0;
- net = sock_net(sk);
tsk = tipc_sk(sk);
lock_sock(sk);
- /*
- * Reject all unreceived messages, except on an active connection
- * (which disconnects locally & sends a 'FIN+' to peer)
- */
- dnode = tsk_peer_node(tsk);
- while (sock->state != SS_DISCONNECTING) {
- skb = __skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL)
- break;
- if (TIPC_SKB_CB(skb)->handle != NULL)
- kfree_skb(skb);
- else {
- if ((sock->state == SS_CONNECTING) ||
- (sock->state == SS_CONNECTED)) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- tipc_node_remove_conn(net, dnode, tsk->portid);
- }
- tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
- }
- }
-
+ __tipc_shutdown(sock, TIPC_ERR_NO_PORT);
+ sk->sk_shutdown = SHUTDOWN_MASK;
tipc_sk_withdraw(tsk, 0, NULL);
sk_stop_timer(sk, &sk->sk_timer);
tipc_sk_remove(tsk);
- if (tsk->connected) {
- skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
- TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
- tsk_own_node(tsk), tsk_peer_port(tsk),
- tsk->portid, TIPC_ERR_NO_PORT);
- if (skb)
- tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
- tipc_node_remove_conn(net, dnode, tsk->portid);
- }
/* Reject any messages that accumulated in backlog queue */
- sock->state = SS_DISCONNECTING;
release_sock(sk);
call_rcu(&tsk->rcu, tipc_sk_callback);
@@ -578,13 +589,14 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
int *uaddr_len, int peer)
{
struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
- struct tipc_sock *tsk = tipc_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id);
memset(addr, 0, sizeof(*addr));
if (peer) {
- if ((sock->state != SS_CONNECTED) &&
- ((peer != 2) || (sock->state != SS_DISCONNECTING)))
+ if ((!tipc_sk_connected(sk)) &&
+ ((peer != 2) || (sk->sk_state != TIPC_DISCONNECTING)))
return -ENOTCONN;
addr->addr.id.ref = tsk_peer_port(tsk);
addr->addr.id.node = tsk_peer_node(tsk);
@@ -616,28 +628,6 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
* exits. TCP and other protocols seem to rely on higher level poll routines
* to handle any preventable race conditions, so TIPC will do the same ...
*
- * TIPC sets the returned events as follows:
- *
- * socket state flags set
- * ------------ ---------
- * unconnected no read flags
- * POLLOUT if port is not congested
- *
- * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
- * no write flags
- *
- * connected POLLIN/POLLRDNORM if data in rx queue
- * POLLOUT if port is not congested
- *
- * disconnecting POLLIN/POLLRDNORM/POLLHUP
- * no write flags
- *
- * listening POLLIN if SYN in rx queue
- * no write flags
- *
- * ready POLLIN/POLLRDNORM if data in rx queue
- * [connectionless] POLLOUT (since port cannot be congested)
- *
* IMPORTANT: The fact that a read or write operation is indicated does NOT
* imply that the operation will succeed, merely that it should be performed
* and will not block.
@@ -651,22 +641,29 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sk_sleep(sk), wait);
- switch ((int)sock->state) {
- case SS_UNCONNECTED:
- if (!tsk->link_cong)
- mask |= POLLOUT;
- break;
- case SS_READY:
- case SS_CONNECTED:
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ switch (sk->sk_state) {
+ case TIPC_ESTABLISHED:
if (!tsk->link_cong && !tsk_conn_cong(tsk))
mask |= POLLOUT;
/* fall thru' */
- case SS_CONNECTING:
- case SS_LISTENING:
+ case TIPC_LISTEN:
+ case TIPC_CONNECTING:
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= (POLLIN | POLLRDNORM);
break;
- case SS_DISCONNECTING:
+ case TIPC_OPEN:
+ if (!tsk->link_cong)
+ mask |= POLLOUT;
+ if (tipc_sk_type_connectionless(sk) &&
+ (!skb_queue_empty(&sk->sk_receive_queue)))
+ mask |= (POLLIN | POLLRDNORM);
+ break;
+ case TIPC_DISCONNECTING:
mask = (POLLIN | POLLRDNORM | POLLHUP);
break;
}
@@ -697,6 +694,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
uint mtu;
int rc;
+ if (!timeo && tsk->link_cong)
+ return -ELINKCONG;
+
msg_set_type(mhdr, TIPC_MCAST_MSG);
msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE);
msg_set_destport(mhdr, 0);
@@ -809,7 +809,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
if (!tsk_peer_msg(tsk, hdr))
goto exit;
- tsk->probing_state = TIPC_CONN_OK;
+ tsk->probe_unacked = false;
if (mtyp == CONN_PROBE) {
msg_set_type(hdr, CONN_PROBE_REPLY);
@@ -832,25 +832,25 @@ exit:
static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
- DEFINE_WAIT(wait);
int done;
do {
int err = sock_error(sk);
if (err)
return err;
- if (sock->state == SS_DISCONNECTING)
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
return -EPIPE;
if (!*timeo_p)
return -EAGAIN;
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, timeo_p, !tsk->link_cong);
- finish_wait(sk_sleep(sk), &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
} while (!done);
return 0;
}
@@ -890,6 +890,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
struct tipc_msg *mhdr = &tsk->phdr;
u32 dnode, dport;
struct sk_buff_head pktchain;
+ bool is_connectionless = tipc_sk_type_connectionless(sk);
struct sk_buff *skb;
struct tipc_name_seq *seq;
struct iov_iter save;
@@ -900,18 +901,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
if (dsz > TIPC_MAX_USER_MSG_SIZE)
return -EMSGSIZE;
if (unlikely(!dest)) {
- if (tsk->connected && sock->state == SS_READY)
- dest = &tsk->remote;
+ if (is_connectionless && tsk->peer.family == AF_TIPC)
+ dest = &tsk->peer;
else
return -EDESTADDRREQ;
} else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
dest->family != AF_TIPC) {
return -EINVAL;
}
- if (unlikely(sock->state != SS_READY)) {
- if (sock->state == SS_LISTENING)
+ if (!is_connectionless) {
+ if (sk->sk_state == TIPC_LISTEN)
return -EPIPE;
- if (sock->state != SS_UNCONNECTED)
+ if (sk->sk_state != TIPC_OPEN)
return -EISCONN;
if (tsk->published)
return -EOPNOTSUPP;
@@ -963,8 +964,8 @@ new_mtu:
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
if (likely(!rc)) {
- if (sock->state != SS_READY)
- sock->state = SS_CONNECTING;
+ if (!is_connectionless)
+ tipc_set_sk_state(sk, TIPC_CONNECTING);
return dsz;
}
if (rc == -ELINKCONG) {
@@ -986,30 +987,30 @@ new_mtu:
static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
- DEFINE_WAIT(wait);
int done;
do {
int err = sock_error(sk);
if (err)
return err;
- if (sock->state == SS_DISCONNECTING)
+ if (sk->sk_state == TIPC_DISCONNECTING)
return -EPIPE;
- else if (sock->state != SS_CONNECTED)
+ else if (!tipc_sk_connected(sk))
return -ENOTCONN;
if (!*timeo_p)
return -EAGAIN;
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
done = sk_wait_event(sk, timeo_p,
(!tsk->link_cong &&
!tsk_conn_cong(tsk)) ||
- !tsk->connected);
- finish_wait(sk_sleep(sk), &wait);
+ !tipc_sk_connected(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
} while (!done);
return 0;
}
@@ -1064,14 +1065,17 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
if (dsz > (uint)INT_MAX)
return -EMSGSIZE;
- if (unlikely(sock->state != SS_CONNECTED)) {
- if (sock->state == SS_DISCONNECTING)
+ if (unlikely(!tipc_sk_connected(sk))) {
+ if (sk->sk_state == TIPC_DISCONNECTING)
return -EPIPE;
else
return -ENOTCONN;
}
timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+ if (!timeo && tsk->link_cong)
+ return -ELINKCONG;
+
dnode = tsk_peer_node(tsk);
skb_queue_head_init(&pktchain);
@@ -1145,10 +1149,8 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
msg_set_lookup_scope(msg, 0);
msg_set_hdr_sz(msg, SHORT_H_SIZE);
- tsk->probing_intv = CONN_PROBING_INTERVAL;
- tsk->probing_state = TIPC_CONN_OK;
- tsk->connected = 1;
- sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
+ tipc_set_sk_state(sk, TIPC_ESTABLISHED);
tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
@@ -1256,13 +1258,14 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
static void tipc_sk_send_ack(struct tipc_sock *tsk)
{
- struct net *net = sock_net(&tsk->sk);
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
struct sk_buff *skb = NULL;
struct tipc_msg *msg;
u32 peer_port = tsk_peer_port(tsk);
u32 dnode = tsk_peer_node(tsk);
- if (!tsk->connected)
+ if (!tipc_sk_connected(sk))
return;
skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,
dnode, tsk_own_node(tsk), peer_port,
@@ -1291,7 +1294,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
- if (sock->state == SS_DISCONNECTING) {
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
err = -ENOTCONN;
break;
}
@@ -1332,6 +1335,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
struct tipc_sock *tsk = tipc_sk(sk);
struct sk_buff *buf;
struct tipc_msg *msg;
+ bool is_connectionless = tipc_sk_type_connectionless(sk);
long timeo;
unsigned int sz;
u32 err;
@@ -1343,7 +1347,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
lock_sock(sk);
- if (unlikely(sock->state == SS_UNCONNECTED)) {
+ if (!is_connectionless && unlikely(sk->sk_state == TIPC_OPEN)) {
res = -ENOTCONN;
goto exit;
}
@@ -1388,8 +1392,8 @@ restart:
goto exit;
res = sz;
} else {
- if ((sock->state == SS_READY) ||
- ((err == TIPC_CONN_SHUTDOWN) || m->msg_control))
+ if (is_connectionless || err == TIPC_CONN_SHUTDOWN ||
+ m->msg_control)
res = 0;
else
res = -ECONNRESET;
@@ -1398,7 +1402,7 @@ restart:
if (unlikely(flags & MSG_PEEK))
goto exit;
- if (likely(sock->state != SS_READY)) {
+ if (likely(!is_connectionless)) {
tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
tipc_sk_send_ack(tsk);
@@ -1429,7 +1433,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
struct tipc_msg *msg;
long timeo;
unsigned int sz;
- int sz_to_copy, target, needed;
+ int target;
int sz_copied = 0;
u32 err;
int res = 0, hlen;
@@ -1440,7 +1444,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
lock_sock(sk);
- if (unlikely(sock->state == SS_UNCONNECTED)) {
+ if (unlikely(sk->sk_state == TIPC_OPEN)) {
res = -ENOTCONN;
goto exit;
}
@@ -1477,11 +1481,13 @@ restart:
/* Capture message data (if valid) & compute return value (always) */
if (!err) {
- u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
+ u32 offset = TIPC_SKB_CB(buf)->bytes_read;
+ u32 needed;
+ int sz_to_copy;
sz -= offset;
needed = (buf_len - sz_copied);
- sz_to_copy = (sz <= needed) ? sz : needed;
+ sz_to_copy = min(sz, needed);
res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy);
if (res)
@@ -1491,8 +1497,8 @@ restart:
if (sz_to_copy < sz) {
if (!(flags & MSG_PEEK))
- TIPC_SKB_CB(buf)->handle =
- (void *)(unsigned long)(offset + sz_to_copy);
+ TIPC_SKB_CB(buf)->bytes_read =
+ offset + sz_to_copy;
goto exit;
}
} else {
@@ -1574,49 +1580,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
- struct socket *sock = sk->sk_socket;
struct tipc_msg *hdr = buf_msg(skb);
if (unlikely(msg_mcast(hdr)))
return false;
- switch ((int)sock->state) {
- case SS_CONNECTED:
-
- /* Accept only connection-based messages sent by peer */
- if (unlikely(!tsk_peer_msg(tsk, hdr)))
- return false;
-
- if (unlikely(msg_errcode(hdr))) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- /* Let timer expire on it's own */
- tipc_node_remove_conn(net, tsk_peer_node(tsk),
- tsk->portid);
- }
- return true;
-
- case SS_CONNECTING:
-
+ switch (sk->sk_state) {
+ case TIPC_CONNECTING:
/* Accept only ACK or NACK message */
if (unlikely(!msg_connected(hdr)))
return false;
if (unlikely(msg_errcode(hdr))) {
- sock->state = SS_DISCONNECTING;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = ECONNREFUSED;
return true;
}
if (unlikely(!msg_isdata(hdr))) {
- sock->state = SS_DISCONNECTING;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = EINVAL;
return true;
}
tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
msg_set_importance(&tsk->phdr, msg_importance(hdr));
- sock->state = SS_CONNECTED;
/* If 'ACK+' message, add to socket receive queue */
if (msg_data_sz(hdr))
@@ -1630,18 +1618,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
msg_set_dest_droppable(hdr, 1);
return false;
- case SS_LISTENING:
- case SS_UNCONNECTED:
-
+ case TIPC_OPEN:
+ case TIPC_DISCONNECTING:
+ break;
+ case TIPC_LISTEN:
/* Accept only SYN message */
if (!msg_connected(hdr) && !(msg_errcode(hdr)))
return true;
break;
- case SS_DISCONNECTING:
- break;
+ case TIPC_ESTABLISHED:
+ /* Accept only connection-based messages sent by peer */
+ if (unlikely(!tsk_peer_msg(tsk, hdr)))
+ return false;
+
+ if (unlikely(msg_errcode(hdr))) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ /* Let timer expire on it's own */
+ tipc_node_remove_conn(net, tsk_peer_node(tsk),
+ tsk->portid);
+ sk->sk_state_change(sk);
+ }
+ return true;
default:
- pr_err("Unknown socket state %u\n", sock->state);
+ pr_err("Unknown sk_state %u\n", sk->sk_state);
}
+
return false;
}
@@ -1692,7 +1693,6 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *xmitq)
{
- struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_msg *hdr = buf_msg(skb);
unsigned int limit = rcvbuf_limit(sk, skb);
@@ -1718,7 +1718,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
}
/* Reject if wrong message type for current socket state */
- if (unlikely(sock->state == SS_READY)) {
+ if (tipc_sk_type_connectionless(sk)) {
if (msg_connected(hdr)) {
err = TIPC_ERR_NO_PORT;
goto reject;
@@ -1735,7 +1735,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
}
/* Enqueue message */
- TIPC_SKB_CB(skb)->handle = NULL;
+ TIPC_SKB_CB(skb)->bytes_read = 0;
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
@@ -1885,8 +1885,8 @@ xmit:
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk;
- DEFINE_WAIT(wait);
int done;
do {
@@ -1898,9 +1898,10 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, timeo_p, sock->state != SS_CONNECTING);
- finish_wait(sk_sleep(sk), &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ done = sk_wait_event(sk, timeo_p,
+ sk->sk_state != TIPC_CONNECTING, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
} while (!done);
return 0;
}
@@ -1922,21 +1923,19 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
struct msghdr m = {NULL,};
long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout;
- socket_state previous;
+ int previous;
int res = 0;
lock_sock(sk);
/* DGRAM/RDM connect(), just save the destaddr */
- if (sock->state == SS_READY) {
+ if (tipc_sk_type_connectionless(sk)) {
if (dst->family == AF_UNSPEC) {
- memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc));
- tsk->connected = 0;
+ memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc));
} else if (destlen != sizeof(struct sockaddr_tipc)) {
res = -EINVAL;
} else {
- memcpy(&tsk->remote, dest, destlen);
- tsk->connected = 1;
+ memcpy(&tsk->peer, dest, destlen);
}
goto exit;
}
@@ -1952,9 +1951,10 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
goto exit;
}
- previous = sock->state;
- switch (sock->state) {
- case SS_UNCONNECTED:
+ previous = sk->sk_state;
+
+ switch (sk->sk_state) {
+ case TIPC_OPEN:
/* Send a 'SYN-' to destination */
m.msg_name = dest;
m.msg_namelen = destlen;
@@ -1969,27 +1969,29 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
if ((res < 0) && (res != -EWOULDBLOCK))
goto exit;
- /* Just entered SS_CONNECTING state; the only
+ /* Just entered TIPC_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
res = -EINPROGRESS;
- case SS_CONNECTING:
- if (previous == SS_CONNECTING)
- res = -EALREADY;
- if (!timeout)
+ /* fall thru' */
+ case TIPC_CONNECTING:
+ if (!timeout) {
+ if (previous == TIPC_CONNECTING)
+ res = -EALREADY;
goto exit;
+ }
timeout = msecs_to_jiffies(timeout);
/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
res = tipc_wait_for_connect(sock, &timeout);
break;
- case SS_CONNECTED:
+ case TIPC_ESTABLISHED:
res = -EISCONN;
break;
default:
res = -EINVAL;
- break;
}
+
exit:
release_sock(sk);
return res;
@@ -2008,15 +2010,9 @@ static int tipc_listen(struct socket *sock, int len)
int res;
lock_sock(sk);
-
- if (sock->state != SS_UNCONNECTED)
- res = -EINVAL;
- else {
- sock->state = SS_LISTENING;
- res = 0;
- }
-
+ res = tipc_set_sk_state(sk, TIPC_LISTEN);
release_sock(sk);
+
return res;
}
@@ -2042,9 +2038,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
err = 0;
if (!skb_queue_empty(&sk->sk_receive_queue))
break;
- err = -EINVAL;
- if (sock->state != SS_LISTENING)
- break;
err = -EAGAIN;
if (!timeo)
break;
@@ -2075,7 +2068,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
lock_sock(sk);
- if (sock->state != SS_LISTENING) {
+ if (sk->sk_state != TIPC_LISTEN) {
res = -EINVAL;
goto exit;
}
@@ -2086,7 +2079,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
buf = skb_peek(&sk->sk_receive_queue);
- res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 1);
+ res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 0);
if (res)
goto exit;
security_sk_clone(sock->sk, new_sock->sk);
@@ -2106,7 +2099,6 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
/* Connect new socket to it's peer */
tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
- new_sock->state = SS_CONNECTED;
tsk_set_importance(new_tsock, msg_importance(msg));
if (msg_named(msg)) {
@@ -2146,13 +2138,6 @@ exit:
static int tipc_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
- struct tipc_sock *tsk = tipc_sk(sk);
- struct sk_buff *skb;
- u32 dnode = tsk_peer_node(tsk);
- u32 dport = tsk_peer_port(tsk);
- u32 onode = tipc_own_addr(net);
- u32 oport = tsk->portid;
int res;
if (how != SHUT_RDWR)
@@ -2160,45 +2145,17 @@ static int tipc_shutdown(struct socket *sock, int how)
lock_sock(sk);
- switch (sock->state) {
- case SS_CONNECTING:
- case SS_CONNECTED:
-
-restart:
- dnode = tsk_peer_node(tsk);
-
- /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
- skb = __skb_dequeue(&sk->sk_receive_queue);
- if (skb) {
- if (TIPC_SKB_CB(skb)->handle != NULL) {
- kfree_skb(skb);
- goto restart;
- }
- tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN);
- } else {
- skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
- TIPC_CONN_MSG, SHORT_H_SIZE,
- 0, dnode, onode, dport, oport,
- TIPC_CONN_SHUTDOWN);
- if (skb)
- tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
- }
- tsk->connected = 0;
- sock->state = SS_DISCONNECTING;
- tipc_node_remove_conn(net, dnode, tsk->portid);
- /* fall through */
-
- case SS_DISCONNECTING:
+ __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
+ sk->sk_shutdown = SEND_SHUTDOWN;
+ if (sk->sk_state == TIPC_DISCONNECTING) {
/* Discard any unreceived messages */
__skb_queue_purge(&sk->sk_receive_queue);
/* Wake up anyone sleeping in poll */
sk->sk_state_change(sk);
res = 0;
- break;
-
- default:
+ } else {
res = -ENOTCONN;
}
@@ -2215,17 +2172,16 @@ static void tipc_sk_timeout(unsigned long data)
u32 own_node = tsk_own_node(tsk);
bh_lock_sock(sk);
- if (!tsk->connected) {
+ if (!tipc_sk_connected(sk)) {
bh_unlock_sock(sk);
goto exit;
}
peer_port = tsk_peer_port(tsk);
peer_node = tsk_peer_node(tsk);
- if (tsk->probing_state == TIPC_CONN_PROBING) {
+ if (tsk->probe_unacked) {
if (!sock_owned_by_user(sk)) {
- sk->sk_socket->state = SS_DISCONNECTING;
- tsk->connected = 0;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
tsk_peer_port(tsk));
sk->sk_state_change(sk);
@@ -2234,13 +2190,15 @@ static void tipc_sk_timeout(unsigned long data)
sk_reset_timer(sk, &sk->sk_timer, (HZ / 20));
}
- } else {
- skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
- INT_H_SIZE, 0, peer_node, own_node,
- peer_port, tsk->portid, TIPC_OK);
- tsk->probing_state = TIPC_CONN_PROBING;
- sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ bh_unlock_sock(sk);
+ goto exit;
}
+
+ skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
+ INT_H_SIZE, 0, peer_node, own_node,
+ peer_port, tsk->portid, TIPC_OK);
+ tsk->probe_unacked = true;
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
bh_unlock_sock(sk);
if (skb)
tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
@@ -2251,11 +2209,12 @@ exit:
static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
struct tipc_name_seq const *seq)
{
- struct net *net = sock_net(&tsk->sk);
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
struct publication *publ;
u32 key;
- if (tsk->connected)
+ if (tipc_sk_connected(sk))
return -EINVAL;
key = tsk->portid + tsk->pub_count + 1;
if (key == tsk->portid)
@@ -2713,6 +2672,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
struct nlattr *attrs;
struct net *net = sock_net(skb->sk);
struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sock *sk = &tsk->sk;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET);
@@ -2727,7 +2687,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
goto attr_msg_cancel;
- if (tsk->connected) {
+ if (tipc_sk_connected(sk)) {
err = __tipc_nl_add_sk_con(skb, tsk);
if (err)
goto attr_msg_cancel;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 0dd02244e21d..9d94e65d0894 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -54,6 +54,8 @@ struct tipc_subscriber {
static void tipc_subscrp_delete(struct tipc_subscription *sub);
static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
+static void tipc_subscrp_put(struct tipc_subscription *subscription);
+static void tipc_subscrp_get(struct tipc_subscription *subscription);
/**
* htohl - convert value to endianness used by destination
@@ -123,6 +125,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
{
struct tipc_name_seq seq;
+ tipc_subscrp_get(sub);
tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
return;
@@ -132,30 +135,23 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
node);
+ tipc_subscrp_put(sub);
}
static void tipc_subscrp_timeout(unsigned long data)
{
struct tipc_subscription *sub = (struct tipc_subscription *)data;
- struct tipc_subscriber *subscriber = sub->subscriber;
/* Notify subscriber of timeout */
tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
TIPC_SUBSCR_TIMEOUT, 0, 0);
- spin_lock_bh(&subscriber->lock);
- tipc_subscrp_delete(sub);
- spin_unlock_bh(&subscriber->lock);
-
- tipc_subscrb_put(subscriber);
+ tipc_subscrp_put(sub);
}
static void tipc_subscrb_kref_release(struct kref *kref)
{
- struct tipc_subscriber *subcriber = container_of(kref,
- struct tipc_subscriber, kref);
-
- kfree(subcriber);
+ kfree(container_of(kref,struct tipc_subscriber, kref));
}
static void tipc_subscrb_put(struct tipc_subscriber *subscriber)
@@ -168,6 +164,59 @@ static void tipc_subscrb_get(struct tipc_subscriber *subscriber)
kref_get(&subscriber->kref);
}
+static void tipc_subscrp_kref_release(struct kref *kref)
+{
+ struct tipc_subscription *sub = container_of(kref,
+ struct tipc_subscription,
+ kref);
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ struct tipc_subscriber *subscriber = sub->subscriber;
+
+ spin_lock_bh(&subscriber->lock);
+ tipc_nametbl_unsubscribe(sub);
+ list_del(&sub->subscrp_list);
+ atomic_dec(&tn->subscription_count);
+ spin_unlock_bh(&subscriber->lock);
+ kfree(sub);
+ tipc_subscrb_put(subscriber);
+}
+
+static void tipc_subscrp_put(struct tipc_subscription *subscription)
+{
+ kref_put(&subscription->kref, tipc_subscrp_kref_release);
+}
+
+static void tipc_subscrp_get(struct tipc_subscription *subscription)
+{
+ kref_get(&subscription->kref);
+}
+
+/* tipc_subscrb_subscrp_delete - delete a specific subscription or all
+ * subscriptions for a given subscriber.
+ */
+static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
+ struct tipc_subscr *s)
+{
+ struct list_head *subscription_list = &subscriber->subscrp_list;
+ struct tipc_subscription *sub, *temp;
+
+ spin_lock_bh(&subscriber->lock);
+ list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) {
+ if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
+ continue;
+
+ tipc_subscrp_get(sub);
+ spin_unlock_bh(&subscriber->lock);
+ tipc_subscrp_delete(sub);
+ tipc_subscrp_put(sub);
+ spin_lock_bh(&subscriber->lock);
+
+ if (s)
+ break;
+ }
+ spin_unlock_bh(&subscriber->lock);
+}
+
static struct tipc_subscriber *tipc_subscrb_create(int conid)
{
struct tipc_subscriber *subscriber;
@@ -177,8 +226,8 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
pr_warn("Subscriber rejected, no memory\n");
return NULL;
}
- kref_init(&subscriber->kref);
INIT_LIST_HEAD(&subscriber->subscrp_list);
+ kref_init(&subscriber->kref);
subscriber->conid = conid;
spin_lock_init(&subscriber->lock);
@@ -187,55 +236,22 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
{
- struct tipc_subscription *sub, *temp;
- u32 timeout;
-
- spin_lock_bh(&subscriber->lock);
- /* Destroy any existing subscriptions for subscriber */
- list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
- subscrp_list) {
- timeout = htohl(sub->evt.s.timeout, sub->swap);
- if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) {
- tipc_subscrp_delete(sub);
- tipc_subscrb_put(subscriber);
- }
- }
- spin_unlock_bh(&subscriber->lock);
-
+ tipc_subscrb_subscrp_delete(subscriber, NULL);
tipc_subscrb_put(subscriber);
}
static void tipc_subscrp_delete(struct tipc_subscription *sub)
{
- struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ u32 timeout = htohl(sub->evt.s.timeout, sub->swap);
- tipc_nametbl_unsubscribe(sub);
- list_del(&sub->subscrp_list);
- kfree(sub);
- atomic_dec(&tn->subscription_count);
+ if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer))
+ tipc_subscrp_put(sub);
}
static void tipc_subscrp_cancel(struct tipc_subscr *s,
struct tipc_subscriber *subscriber)
{
- struct tipc_subscription *sub, *temp;
- u32 timeout;
-
- spin_lock_bh(&subscriber->lock);
- /* Find first matching subscription, exit if not found */
- list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
- subscrp_list) {
- if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
- timeout = htohl(sub->evt.s.timeout, sub->swap);
- if ((timeout == TIPC_WAIT_FOREVER) ||
- del_timer(&sub->timer)) {
- tipc_subscrp_delete(sub);
- tipc_subscrb_put(subscriber);
- }
- break;
- }
- }
- spin_unlock_bh(&subscriber->lock);
+ tipc_subscrb_subscrp_delete(subscriber, s);
}
static struct tipc_subscription *tipc_subscrp_create(struct net *net,
@@ -272,6 +288,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
sub->swap = swap;
memcpy(&sub->evt.s, s, sizeof(*s));
atomic_inc(&tn->subscription_count);
+ kref_init(&sub->kref);
return sub;
}
@@ -288,17 +305,16 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
spin_lock_bh(&subscriber->lock);
list_add(&sub->subscrp_list, &subscriber->subscrp_list);
- tipc_subscrb_get(subscriber);
sub->subscriber = subscriber;
tipc_nametbl_subscribe(sub);
+ tipc_subscrb_get(subscriber);
spin_unlock_bh(&subscriber->lock);
+ setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
timeout = htohl(sub->evt.s.timeout, swap);
- if (timeout == TIPC_WAIT_FOREVER)
- return;
- setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
- mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
+ if (timeout != TIPC_WAIT_FOREVER)
+ mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
}
/* Handle one termination request for the subscriber */
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index be60103082c9..ffdc214c117a 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -57,6 +57,7 @@ struct tipc_subscriber;
* @evt: template for events generated by subscription
*/
struct tipc_subscription {
+ struct kref kref;
struct tipc_subscriber *subscriber;
struct net *net;
struct timer_list timer;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index ae7e14cae085..b58dc95f3d35 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -49,6 +49,7 @@
#include "core.h"
#include "bearer.h"
#include "netlink.h"
+#include "msg.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
@@ -70,6 +71,13 @@ struct udp_media_addr {
};
};
+/* struct udp_replicast - container for UDP remote addresses */
+struct udp_replicast {
+ struct udp_media_addr addr;
+ struct rcu_head rcu;
+ struct list_head list;
+};
+
/**
* struct udp_bearer - ip/udp bearer data structure
* @bearer: associated generic tipc bearer
@@ -82,8 +90,20 @@ struct udp_bearer {
struct socket *ubsock;
u32 ifindex;
struct work_struct work;
+ struct udp_replicast rcast;
};
+static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr)
+{
+ if (ntohs(addr->proto) == ETH_P_IP)
+ return ipv4_is_multicast(addr->ipv4.s_addr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ return ipv6_addr_is_multicast(&addr->ipv6);
+#endif
+ return 0;
+}
+
/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */
static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
struct udp_media_addr *ua)
@@ -91,15 +111,9 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
memset(addr, 0, sizeof(struct tipc_media_addr));
addr->media_id = TIPC_MEDIA_TYPE_UDP;
memcpy(addr->value, ua, sizeof(struct udp_media_addr));
- if (ntohs(ua->proto) == ETH_P_IP) {
- if (ipv4_is_multicast(ua->ipv4.s_addr))
- addr->broadcast = 1;
- } else if (ntohs(ua->proto) == ETH_P_IPV6) {
- if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST)
- addr->broadcast = 1;
- } else {
- pr_err("Invalid UDP media address\n");
- }
+
+ if (tipc_udp_is_mcast_addr(ua))
+ addr->broadcast = 1;
}
/* tipc_udp_addr2str - convert ip/udp address to string */
@@ -140,28 +154,13 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
}
/* tipc_send_msg - enqueue a send request */
-static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
- struct tipc_bearer *b,
- struct tipc_media_addr *dest)
+static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
+ struct udp_bearer *ub, struct udp_media_addr *src,
+ struct udp_media_addr *dst)
{
int ttl, err = 0;
- struct udp_bearer *ub;
- struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value;
- struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
struct rtable *rt;
- if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
- err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
- if (err)
- goto tx_error;
- }
-
- skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
- ub = rcu_dereference_rtnl(b->media_ptr);
- if (!ub) {
- err = -ENODEV;
- goto tx_error;
- }
if (dst->proto == htons(ETH_P_IP)) {
struct flowi4 fl = {
.daddr = dst->ipv4.s_addr,
@@ -207,29 +206,178 @@ tx_error:
return err;
}
+static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *addr)
+{
+ struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
+ struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value;
+ struct udp_replicast *rcast;
+ struct udp_bearer *ub;
+ int err = 0;
+
+ if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
+ err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
+ if (err)
+ goto out;
+ }
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (!addr->broadcast || list_empty(&ub->rcast.list))
+ return tipc_udp_xmit(net, skb, ub, src, dst);
+
+ /* Replicast, send an skb to each configured IP address */
+ list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
+ struct sk_buff *_skb;
+
+ _skb = pskb_copy(skb, GFP_ATOMIC);
+ if (!_skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr);
+ if (err) {
+ kfree_skb(_skb);
+ goto out;
+ }
+ }
+ err = 0;
+out:
+ kfree_skb(skb);
+ return err;
+}
+
+static bool tipc_udp_is_known_peer(struct tipc_bearer *b,
+ struct udp_media_addr *addr)
+{
+ struct udp_replicast *rcast, *tmp;
+ struct udp_bearer *ub;
+
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub) {
+ pr_err_ratelimited("UDP bearer instance not found\n");
+ return false;
+ }
+
+ list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
+ if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr)))
+ return true;
+ }
+
+ return false;
+}
+
+static int tipc_udp_rcast_add(struct tipc_bearer *b,
+ struct udp_media_addr *addr)
+{
+ struct udp_replicast *rcast;
+ struct udp_bearer *ub;
+
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub)
+ return -ENODEV;
+
+ rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC);
+ if (!rcast)
+ return -ENOMEM;
+
+ memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));
+
+ if (ntohs(addr->proto) == ETH_P_IP)
+ pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (ntohs(addr->proto) == ETH_P_IPV6)
+ pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
+#endif
+
+ list_add_rcu(&rcast->list, &ub->rcast.list);
+ return 0;
+}
+
+static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb)
+{
+ struct udp_media_addr src = {0};
+ struct udp_media_addr *dst;
+
+ dst = (struct udp_media_addr *)&b->bcast_addr.value;
+ if (tipc_udp_is_mcast_addr(dst))
+ return 0;
+
+ src.port = udp_hdr(skb)->source;
+
+ if (ip_hdr(skb)->version == 4) {
+ struct iphdr *iphdr = ip_hdr(skb);
+
+ src.proto = htons(ETH_P_IP);
+ src.ipv4.s_addr = iphdr->saddr;
+ if (ipv4_is_multicast(iphdr->daddr))
+ return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (ip_hdr(skb)->version == 6) {
+ struct ipv6hdr *iphdr = ipv6_hdr(skb);
+
+ src.proto = htons(ETH_P_IPV6);
+ src.ipv6 = iphdr->saddr;
+ if (ipv6_addr_is_multicast(&iphdr->daddr))
+ return 0;
+#endif
+ } else {
+ return 0;
+ }
+
+ if (likely(tipc_udp_is_known_peer(b, &src)))
+ return 0;
+
+ return tipc_udp_rcast_add(b, &src);
+}
+
/* tipc_udp_recv - read data from bearer socket */
static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct udp_bearer *ub;
struct tipc_bearer *b;
+ struct tipc_msg *hdr;
+ int err;
ub = rcu_dereference_sk_user_data(sk);
if (!ub) {
pr_err_ratelimited("Failed to get UDP bearer reference");
- kfree_skb(skb);
- return 0;
+ goto out;
}
-
skb_pull(skb, sizeof(struct udphdr));
+ hdr = buf_msg(skb);
+
rcu_read_lock();
b = rcu_dereference_rtnl(ub->bearer);
+ if (!b)
+ goto rcu_out;
- if (b) {
+ if (b && test_bit(0, &b->up)) {
tipc_rcv(sock_net(sk), skb, b);
rcu_read_unlock();
return 0;
}
+
+ if (unlikely(msg_user(hdr) == LINK_CONFIG)) {
+ err = tipc_udp_rcast_disc(b, skb);
+ if (err)
+ goto rcu_out;
+ }
+
+ tipc_rcv(sock_net(sk), skb, b);
rcu_read_unlock();
+ return 0;
+
+rcu_out:
+ rcu_read_unlock();
+out:
kfree_skb(skb);
return 0;
}
@@ -241,15 +389,11 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
struct sock *sk = ub->ubsock->sk;
if (ntohs(remote->proto) == ETH_P_IP) {
- if (!ipv4_is_multicast(remote->ipv4.s_addr))
- return 0;
mreqn.imr_multiaddr = remote->ipv4;
mreqn.imr_ifindex = ub->ifindex;
err = ip_mc_join_group(sk, &mreqn);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- if (!ipv6_addr_is_multicast(&remote->ipv6))
- return 0;
err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex,
&remote->ipv6);
#endif
@@ -257,75 +401,236 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
return err;
}
-/**
- * parse_options - build local/remote addresses from configuration
- * @attrs: netlink config data
- * @ub: UDP bearer instance
- * @local: local bearer IP address/port
- * @remote: peer or multicast IP/port
- */
-static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
- struct udp_media_addr *local,
- struct udp_media_addr *remote)
+static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
+ struct udp_media_addr *addr, int nla_t)
{
- struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
- struct sockaddr_storage sa_local, sa_remote;
+ if (ntohs(addr->proto) == ETH_P_IP) {
+ struct sockaddr_in ip4;
- if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
- goto err;
- if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX,
- attrs[TIPC_NLA_BEARER_UDP_OPTS],
- tipc_nl_udp_policy))
- goto err;
- if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
- nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL],
- sizeof(sa_local));
- nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE],
- sizeof(sa_remote));
+ memset(&ip4, 0, sizeof(ip4));
+ ip4.sin_family = AF_INET;
+ ip4.sin_port = addr->port;
+ ip4.sin_addr.s_addr = addr->ipv4.s_addr;
+ if (nla_put(skb, nla_t, sizeof(ip4), &ip4))
+ return -EMSGSIZE;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (ntohs(addr->proto) == ETH_P_IPV6) {
+ struct sockaddr_in6 ip6;
+
+ memset(&ip6, 0, sizeof(ip6));
+ ip6.sin6_family = AF_INET6;
+ ip6.sin6_port = addr->port;
+ memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));
+ if (nla_put(skb, nla_t, sizeof(ip6), &ip6))
+ return -EMSGSIZE;
+#endif
+ }
+
+ return 0;
+}
+
+int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ u32 bid = cb->args[0];
+ u32 skip_cnt = cb->args[1];
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct udp_replicast *rcast, *tmp;
+ struct tipc_bearer *b;
+ struct udp_bearer *ub;
+ void *hdr;
+ int err;
+ int i;
+
+ if (!bid && !skip_cnt) {
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1];
+ struct nlattr **attrs;
+ char *bname;
+
+ err = tipc_nlmsg_parse(cb->nlh, &attrs);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(battrs, TIPC_NLA_BEARER_MAX,
+ attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!battrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+
+ bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]);
+
+ rtnl_lock();
+ b = tipc_bearer_find(net, bname);
+ if (!b) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+ bid = b->identity;
} else {
-err:
- pr_err("Invalid UDP bearer configuration");
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ rtnl_lock();
+ b = rtnl_dereference(tn->bearer_list[bid]);
+ if (!b) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+ }
+
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub) {
+ rtnl_unlock();
return -EINVAL;
}
- if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) {
- struct sockaddr_in *ip4;
-
- ip4 = (struct sockaddr_in *)&sa_local;
- local->proto = htons(ETH_P_IP);
- local->port = ip4->sin_port;
- local->ipv4.s_addr = ip4->sin_addr.s_addr;
-
- ip4 = (struct sockaddr_in *)&sa_remote;
- remote->proto = htons(ETH_P_IP);
- remote->port = ip4->sin_port;
- remote->ipv4.s_addr = ip4->sin_addr.s_addr;
+
+ i = 0;
+ list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
+ if (i < skip_cnt)
+ goto count;
+
+ hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
+ &tipc_genl_family, NLM_F_MULTI,
+ TIPC_NL_BEARER_GET);
+ if (!hdr)
+ goto done;
+
+ err = __tipc_nl_add_udp_addr(skb, &rcast->addr,
+ TIPC_NLA_UDP_REMOTE);
+ if (err) {
+ genlmsg_cancel(skb, hdr);
+ goto done;
+ }
+ genlmsg_end(skb, hdr);
+count:
+ i++;
+ }
+done:
+ rtnl_unlock();
+ cb->args[0] = bid;
+ cb->args[1] = i;
+
+ return skb->len;
+}
+
+int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
+{
+ struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
+ struct udp_media_addr *dst;
+ struct udp_bearer *ub;
+ struct nlattr *nest;
+
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub)
+ return -ENODEV;
+
+ nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS);
+ if (!nest)
+ goto msg_full;
+
+ if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL))
+ goto msg_full;
+
+ dst = (struct udp_media_addr *)&b->bcast_addr.value;
+ if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE))
+ goto msg_full;
+
+ if (!list_empty(&ub->rcast.list)) {
+ if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP))
+ goto msg_full;
+ }
+
+ nla_nest_end(msg->skb, nest);
+ return 0;
+msg_full:
+ nla_nest_cancel(msg->skb, nest);
+ return -EMSGSIZE;
+}
+
+/**
+ * tipc_parse_udp_addr - build udp media address from netlink data
+ * @nlattr: netlink attribute containing sockaddr storage aligned address
+ * @addr: tipc media address to fill with address, port and protocol type
+ * @scope_id: IPv6 scope id pointer, not NULL indicates it's required
+ */
+
+static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr,
+ u32 *scope_id)
+{
+ struct sockaddr_storage sa;
+
+ nla_memcpy(&sa, nla, sizeof(sa));
+ if (sa.ss_family == AF_INET) {
+ struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa;
+
+ addr->proto = htons(ETH_P_IP);
+ addr->port = ip4->sin_port;
+ addr->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
#if IS_ENABLED(CONFIG_IPV6)
- } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) {
- int atype;
- struct sockaddr_in6 *ip6;
+ } else if (sa.ss_family == AF_INET6) {
+ struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa;
- ip6 = (struct sockaddr_in6 *)&sa_local;
- atype = ipv6_addr_type(&ip6->sin6_addr);
- if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id)
- return -EINVAL;
+ addr->proto = htons(ETH_P_IPV6);
+ addr->port = ip6->sin6_port;
+ memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
- local->proto = htons(ETH_P_IPV6);
- local->port = ip6->sin6_port;
- memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
- ub->ifindex = ip6->sin6_scope_id;
+ /* Scope ID is only interesting for local addresses */
+ if (scope_id) {
+ int atype;
+
+ atype = ipv6_addr_type(&ip6->sin6_addr);
+ if (__ipv6_addr_needs_scope_id(atype) &&
+ !ip6->sin6_scope_id) {
+ return -EINVAL;
+ }
+
+ *scope_id = ip6->sin6_scope_id ? : 0;
+ }
- ip6 = (struct sockaddr_in6 *)&sa_remote;
- remote->proto = htons(ETH_P_IPV6);
- remote->port = ip6->sin6_port;
- memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
return 0;
#endif
}
return -EADDRNOTAVAIL;
}
+int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr)
+{
+ int err;
+ struct udp_media_addr addr = {0};
+ struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
+ struct udp_media_addr *dst;
+
+ if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy))
+ return -EINVAL;
+
+ if (!opts[TIPC_NLA_UDP_REMOTE])
+ return -EINVAL;
+
+ err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL);
+ if (err)
+ return err;
+
+ dst = (struct udp_media_addr *)&b->bcast_addr.value;
+ if (tipc_udp_is_mcast_addr(dst)) {
+ pr_err("Can't add remote ip to TIPC UDP multicast bearer\n");
+ return -EINVAL;
+ }
+
+ if (tipc_udp_is_known_peer(b, &addr))
+ return 0;
+
+ return tipc_udp_rcast_add(b, &addr);
+}
+
/**
* tipc_udp_enable - callback to create a new udp bearer instance
* @net: network namespace
@@ -340,18 +645,38 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
{
int err = -EINVAL;
struct udp_bearer *ub;
- struct udp_media_addr *remote;
+ struct udp_media_addr remote = {0};
struct udp_media_addr local = {0};
struct udp_port_cfg udp_conf = {0};
struct udp_tunnel_sock_cfg tuncfg = {NULL};
+ struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
if (!ub)
return -ENOMEM;
- remote = (struct udp_media_addr *)&b->bcast_addr.value;
- memset(remote, 0, sizeof(struct udp_media_addr));
- err = parse_options(attrs, ub, &local, remote);
+ INIT_LIST_HEAD(&ub->rcast.list);
+
+ if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
+ goto err;
+
+ if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX,
+ attrs[TIPC_NLA_BEARER_UDP_OPTS],
+ tipc_nl_udp_policy))
+ goto err;
+
+ if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) {
+ pr_err("Invalid UDP bearer configuration");
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local,
+ &ub->ifindex);
+ if (err)
+ goto err;
+
+ err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL);
if (err)
goto err;
@@ -372,6 +697,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
udp_conf.use_udp_checksums = false;
ub->ifindex = dev->ifindex;
+ if (tipc_mtu_bad(dev, sizeof(struct iphdr) +
+ sizeof(struct udphdr))) {
+ err = -EINVAL;
+ goto err;
+ }
b->mtu = dev->mtu - sizeof(struct iphdr)
- sizeof(struct udphdr);
#if IS_ENABLED(CONFIG_IPV6)
@@ -396,9 +726,18 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
tuncfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
- err = enable_mcast(ub, remote);
+ /**
+ * The bcast media address port is used for all peers and the ip
+ * is used if it's a multicast address.
+ */
+ memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
+ if (tipc_udp_is_mcast_addr(&remote))
+ err = enable_mcast(ub, &remote);
+ else
+ err = tipc_udp_rcast_add(b, &remote);
if (err)
goto err;
+
return 0;
err:
if (ub->ubsock)
@@ -411,6 +750,12 @@ err:
static void cleanup_bearer(struct work_struct *work)
{
struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
+ struct udp_replicast *rcast, *tmp;
+
+ list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
+ list_del_rcu(&rcast->list);
+ kfree_rcu(rcast, rcu);
+ }
if (ub->ubsock)
udp_tunnel_sock_release(ub->ubsock);
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
new file mode 100644
index 000000000000..281bbae87726
--- /dev/null
+++ b/net/tipc/udp_media.h
@@ -0,0 +1,46 @@
+/*
+ * net/tipc/udp_media.h: Include file for UDP bearer media
+ *
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+#ifndef _TIPC_UDP_MEDIA_H
+#define _TIPC_UDP_MEDIA_H
+
+int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
+int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
+int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
+
+#endif
+#endif
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8309687a56b0..cef79873b09d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -100,7 +100,7 @@
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
@@ -315,7 +315,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && d_real_inode(dentry) == i) {
+ if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
goto found;
}
@@ -913,7 +913,7 @@ static struct sock *unix_find_other(struct net *net,
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = d_real_inode(path.dentry);
+ inode = d_backing_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
@@ -995,6 +995,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
+ struct path path = { NULL, NULL };
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
@@ -1010,9 +1011,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
addr_len = err;
+ if (sun_path[0]) {
+ umode_t mode = S_IFSOCK |
+ (SOCK_INODE(sock)->i_mode & ~current_umask());
+ err = unix_mknod(sun_path, mode, &path);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ goto out;
+ }
+ }
+
err = mutex_lock_interruptible(&u->bindlock);
if (err)
- goto out;
+ goto out_put;
err = -EINVAL;
if (u->addr)
@@ -1029,18 +1041,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
atomic_set(&addr->refcnt, 1);
if (sun_path[0]) {
- struct path path;
- umode_t mode = S_IFSOCK |
- (SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(sun_path, mode, &path);
- if (err) {
- if (err == -EEXIST)
- err = -EADDRINUSE;
- unix_release_addr(addr);
- goto out_up;
- }
addr->hash = UNIX_HASH_SIZE;
- hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = path;
list = &unix_socket_table[hash];
@@ -1065,6 +1067,9 @@ out_unlock:
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->bindlock);
+out_put:
+ if (err)
+ path_put(&path);
out:
return err;
}
@@ -2113,8 +2118,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
- skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
+ &err, &last);
if (skb)
break;
@@ -2199,7 +2204,8 @@ out:
* Sleep until more data has arrived. But check for races..
*/
static long unix_stream_data_wait(struct sock *sk, long timeo,
- struct sk_buff *last, unsigned int last_len)
+ struct sk_buff *last, unsigned int last_len,
+ bool freezable)
{
struct sk_buff *tail;
DEFINE_WAIT(wait);
@@ -2220,7 +2226,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
- timeo = freezable_schedule_timeout(timeo);
+ if (freezable)
+ timeo = freezable_schedule_timeout(timeo);
+ else
+ timeo = schedule_timeout(timeo);
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD))
@@ -2250,7 +2259,8 @@ struct unix_stream_read_state {
unsigned int splice_flags;
};
-static int unix_stream_read_generic(struct unix_stream_read_state *state)
+static int unix_stream_read_generic(struct unix_stream_read_state *state,
+ bool freezable)
{
struct scm_cookie scm;
struct socket *sock = state->socket;
@@ -2330,7 +2340,7 @@ again:
mutex_unlock(&u->iolock);
timeo = unix_stream_data_wait(sk, timeo, last,
- last_len);
+ last_len, freezable);
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
@@ -2472,21 +2482,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
.flags = flags
};
- return unix_stream_read_generic(&state);
-}
-
-static ssize_t skb_unix_socket_splice(struct sock *sk,
- struct pipe_inode_info *pipe,
- struct splice_pipe_desc *spd)
-{
- int ret;
- struct unix_sock *u = unix_sk(sk);
-
- mutex_unlock(&u->iolock);
- ret = splice_to_pipe(pipe, spd);
- mutex_lock(&u->iolock);
-
- return ret;
+ return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
@@ -2495,8 +2491,7 @@ static int unix_stream_splice_actor(struct sk_buff *skb,
{
return skb_splice_bits(skb, state->socket->sk,
UNIXCB(skb).consumed + skip,
- state->pipe, chunk, state->splice_flags,
- skb_unix_socket_splice);
+ state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
@@ -2518,7 +2513,7 @@ static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
flags & SPLICE_F_NONBLOCK)
state.flags = MSG_DONTWAIT;
- return unix_stream_read_generic(&state);
+ return unix_stream_read_generic(&state, false);
}
static int unix_shutdown(struct socket *sock, int mode)
@@ -2827,7 +2822,8 @@ static int unix_seq_show(struct seq_file *seq, void *v)
i++;
}
for ( ; i < len; i++)
- seq_putc(seq, u->addr->name->sun_path[i]);
+ seq_putc(seq, u->addr->name->sun_path[i] ?:
+ '@');
}
unix_state_unlock(s);
seq_putc(seq, '\n');
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7eee62d0..6788264acc63 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,6 +44,10 @@ struct virtio_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list;
+ struct work_struct loopback_work;
+ spinlock_t loopback_list_lock; /* protects loopback_list */
+ struct list_head loopback_list;
+
atomic_t queued_replies;
/* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void)
return vsock->guest_cid;
}
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, loopback_work);
+ LIST_HEAD(pkts);
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_splice_init(&vsock->loopback_list, &pkts);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ mutex_lock(&vsock->rx_lock);
+ while (!list_empty(&pkts)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+
+ virtio_transport_recv_pkt(pkt);
+ }
+ mutex_unlock(&vsock->rx_lock);
+}
+
+static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
+ struct virtio_vsock_pkt *pkt)
+{
+ int len = pkt->len;
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_add_tail(&pkt->list, &vsock->loopback_list);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
+
+ return len;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -159,6 +199,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return -ENODEV;
}
+ if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
+ return virtio_transport_send_pkt_loopback(vsock, pkt);
+
if (pkt->reply)
atomic_inc(&vsock->queued_replies);
@@ -336,7 +379,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
{
struct virtio_device *vdev = vsock->vdev;
- u64 guest_cid;
+ __le64 guest_cid;
vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
&guest_cid, sizeof(guest_cid));
@@ -510,10 +553,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
mutex_init(&vsock->event_lock);
spin_lock_init(&vsock->send_pkt_list_lock);
INIT_LIST_HEAD(&vsock->send_pkt_list);
+ spin_lock_init(&vsock->loopback_list_lock);
+ INIT_LIST_HEAD(&vsock->loopback_list);
INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
INIT_WORK(&vsock->event_work, virtio_transport_event_work);
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+ INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
@@ -539,6 +585,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
struct virtio_vsock *vsock = vdev->priv;
struct virtio_vsock_pkt *pkt;
+ flush_work(&vsock->loopback_work);
flush_work(&vsock->rx_work);
flush_work(&vsock->tx_work);
flush_work(&vsock->event_work);
@@ -565,6 +612,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->send_pkt_list_lock);
+ spin_lock_bh(&vsock->loopback_list_lock);
+ while (!list_empty(&vsock->loopback_list)) {
+ pkt = list_first_entry(&vsock->loopback_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
mutex_lock(&the_virtio_vsock_mutex);
the_virtio_vsock = NULL;
vsock_core_exit();
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a53b3a16b4f1..849c4ad0411e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -32,7 +32,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
return container_of(t, struct virtio_transport, transport);
}
-struct virtio_vsock_pkt *
+static struct virtio_vsock_pkt *
virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
size_t len,
u32 src_cid,
@@ -82,7 +82,6 @@ out_pkt:
kfree(pkt);
return NULL;
}
-EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info *info)
@@ -606,9 +605,9 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
return 0;
pkt = virtio_transport_alloc_pkt(&info, 0,
- le32_to_cpu(pkt->hdr.dst_cid),
+ le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port),
- le32_to_cpu(pkt->hdr.src_cid),
+ le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
if (!pkt)
return -ENOMEM;
@@ -619,17 +618,17 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
static void virtio_transport_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
if (sk_wait_event(sk, &timeout,
- sock_flag(sk, SOCK_DONE)))
+ sock_flag(sk, SOCK_DONE), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
@@ -823,7 +822,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RESPONSE,
.type = VIRTIO_VSOCK_TYPE_STREAM,
- .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+ .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
.remote_port = le32_to_cpu(pkt->hdr.src_port),
.reply = true,
};
@@ -863,9 +862,9 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
child->sk_state = SS_CONNECTED;
vchild = vsock_sk(child);
- vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+ vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port));
- vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+ vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
vsock_insert_connected(vchild);
@@ -904,9 +903,9 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
struct sock *sk;
bool space_available;
- vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+ vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
- vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+ vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port));
trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
index fd8cf0214d51..1406db4d97d1 100644
--- a/net/vmw_vsock/vmci_transport_notify.c
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -662,19 +662,19 @@ static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
- vmci_transport_notify_pkt_socket_init,
- vmci_transport_notify_pkt_socket_destruct,
- vmci_transport_notify_pkt_poll_in,
- vmci_transport_notify_pkt_poll_out,
- vmci_transport_notify_pkt_handle_pkt,
- vmci_transport_notify_pkt_recv_init,
- vmci_transport_notify_pkt_recv_pre_block,
- vmci_transport_notify_pkt_recv_pre_dequeue,
- vmci_transport_notify_pkt_recv_post_dequeue,
- vmci_transport_notify_pkt_send_init,
- vmci_transport_notify_pkt_send_pre_block,
- vmci_transport_notify_pkt_send_pre_enqueue,
- vmci_transport_notify_pkt_send_post_enqueue,
- vmci_transport_notify_pkt_process_request,
- vmci_transport_notify_pkt_process_negotiate,
+ .socket_init = vmci_transport_notify_pkt_socket_init,
+ .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
+ .poll_in = vmci_transport_notify_pkt_poll_in,
+ .poll_out = vmci_transport_notify_pkt_poll_out,
+ .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
+ .recv_init = vmci_transport_notify_pkt_recv_init,
+ .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
+ .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
+ .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
+ .send_init = vmci_transport_notify_pkt_send_init,
+ .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
+ .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
+ .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
+ .process_request = vmci_transport_notify_pkt_process_request,
+ .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
index 21e591dafb03..f3a0afc46208 100644
--- a/net/vmw_vsock/vmci_transport_notify_qstate.c
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -420,19 +420,19 @@ vmci_transport_notify_pkt_send_pre_enqueue(
/* Socket always on control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
- vmci_transport_notify_pkt_socket_init,
- vmci_transport_notify_pkt_socket_destruct,
- vmci_transport_notify_pkt_poll_in,
- vmci_transport_notify_pkt_poll_out,
- vmci_transport_notify_pkt_handle_pkt,
- vmci_transport_notify_pkt_recv_init,
- vmci_transport_notify_pkt_recv_pre_block,
- vmci_transport_notify_pkt_recv_pre_dequeue,
- vmci_transport_notify_pkt_recv_post_dequeue,
- vmci_transport_notify_pkt_send_init,
- vmci_transport_notify_pkt_send_pre_block,
- vmci_transport_notify_pkt_send_pre_enqueue,
- vmci_transport_notify_pkt_send_post_enqueue,
- vmci_transport_notify_pkt_process_request,
- vmci_transport_notify_pkt_process_negotiate,
+ .socket_init = vmci_transport_notify_pkt_socket_init,
+ .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
+ .poll_in = vmci_transport_notify_pkt_poll_in,
+ .poll_out = vmci_transport_notify_pkt_poll_out,
+ .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
+ .recv_init = vmci_transport_notify_pkt_recv_init,
+ .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
+ .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
+ .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
+ .send_init = vmci_transport_notify_pkt_send_init,
+ .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
+ .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
+ .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
+ .process_request = vmci_transport_notify_pkt_process_request,
+ .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 3f816e2971ee..5db731512014 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -572,16 +572,20 @@ struct d_level D_LEVEL[] = {
size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
-struct genl_family wimax_gnl_family = {
- .id = GENL_ID_GENERATE,
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+ { .name = "msg", },
+};
+
+struct genl_family wimax_gnl_family __ro_after_init = {
.name = "WiMAX",
.version = WIMAX_GNL_VERSION,
.hdrsize = 0,
.maxattr = WIMAX_GNL_ATTR_MAX,
-};
-
-static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
- { .name = "msg", },
+ .module = THIS_MODULE,
+ .ops = wimax_gnl_ops,
+ .n_ops = ARRAY_SIZE(wimax_gnl_ops),
+ .mcgrps = wimax_gnl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
};
@@ -596,11 +600,7 @@ int __init wimax_subsys_init(void)
d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
"wimax.debug");
- snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
- "WiMAX");
- result = genl_register_family_with_ops_groups(&wimax_gnl_family,
- wimax_gnl_ops,
- wimax_gnl_mcgrps);
+ result = genl_register_family(&wimax_gnl_family);
if (unlikely(result < 0)) {
pr_err("cannot register generic netlink family: %d\n", result);
goto error_register_family;
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 4c9e39f04ef8..816c9331c8d2 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -17,8 +17,6 @@ cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
CFLAGS_trace.o := -I$(src)
-ccflags-y += -D__CHECK_ENDIAN__
-
$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk
@$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 0f506220a3bd..5497d022fada 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -372,6 +372,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
break;
case NL80211_IFTYPE_UNSPECIFIED:
case NUM_NL80211_IFTYPES:
@@ -946,6 +947,7 @@ cfg80211_get_chan_state(struct wireless_dev *wdev,
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
/* these interface types don't really have a channel */
return;
case NL80211_IFTYPE_UNSPECIFIED:
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 7645e97362c0..158c59ecf90a 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -210,11 +210,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE))
return;
- if (!wdev->p2p_started)
+ if (!wdev_running(wdev))
return;
rdev_stop_p2p_device(rdev, wdev);
- wdev->p2p_started = false;
+ wdev->is_running = false;
rdev->opencount--;
@@ -225,6 +225,23 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
}
}
+void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ ASSERT_RTNL();
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN))
+ return;
+
+ if (!wdev_running(wdev))
+ return;
+
+ rdev_stop_nan(rdev, wdev);
+ wdev->is_running = false;
+
+ rdev->opencount--;
+}
+
void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
@@ -242,6 +259,9 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
case NL80211_IFTYPE_P2P_DEVICE:
cfg80211_stop_p2p_device(rdev, wdev);
break;
+ case NL80211_IFTYPE_NAN:
+ cfg80211_stop_nan(rdev, wdev);
+ break;
default:
break;
}
@@ -537,6 +557,26 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
c->limits[j].max > 1))
return -EINVAL;
+ /* Only a single NAN can be allowed */
+ if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
+ c->limits[j].max > 1))
+ return -EINVAL;
+
+ /*
+ * This isn't well-defined right now. If you have an
+ * IBSS interface, then its beacon interval may change
+ * by joining other networks, and nothing prevents it
+ * from doing that.
+ * So technically we probably shouldn't even allow AP
+ * and IBSS in the same interface, but it seems that
+ * some drivers support that, possibly only with fixed
+ * beacon intervals for IBSS.
+ */
+ if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) &&
+ c->beacon_int_min_gcd)) {
+ return -EINVAL;
+ }
+
cnt += c->limits[j].max;
/*
* Don't advertise an unsupported type
@@ -546,6 +586,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
return -EINVAL;
}
+#ifndef CONFIG_WIRELESS_WDS
+ if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS)))
+ return -EINVAL;
+#endif
+
/* You can't even choose that many! */
if (WARN_ON(cnt < c->max_interfaces))
return -EINVAL;
@@ -579,6 +624,16 @@ int wiphy_register(struct wiphy *wiphy)
!rdev->ops->tdls_cancel_channel_switch)))
return -EINVAL;
+ if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
+ (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
+ !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
+ return -EINVAL;
+
+#ifndef CONFIG_WIRELESS_WDS
+ if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)))
+ return -EINVAL;
+#endif
+
/*
* if a wiphy has unsupported modes for regulatory channel enforcement,
* opt-out of enforcement checking
@@ -589,6 +644,7 @@ int wiphy_register(struct wiphy *wiphy)
BIT(NL80211_IFTYPE_P2P_GO) |
BIT(NL80211_IFTYPE_ADHOC) |
BIT(NL80211_IFTYPE_P2P_DEVICE) |
+ BIT(NL80211_IFTYPE_NAN) |
BIT(NL80211_IFTYPE_AP_VLAN) |
BIT(NL80211_IFTYPE_MONITOR)))
wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF;
@@ -906,6 +962,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
if (WARN_ON(wdev->netdev))
return;
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
+
list_del_rcu(&wdev->list);
rdev->devlist_generation++;
@@ -914,6 +972,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
cfg80211_mlme_purge_registrations(wdev);
cfg80211_stop_p2p_device(rdev, wdev);
break;
+ case NL80211_IFTYPE_NAN:
+ cfg80211_stop_nan(rdev, wdev);
+ break;
default:
WARN_ON_ONCE(1);
break;
@@ -977,6 +1038,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
/* must be handled by mac80211/driver, has no APIs */
break;
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
/* cannot happen, has no netdev */
break;
case NL80211_IFTYPE_AP_VLAN:
@@ -1079,6 +1141,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
dev->priv_flags |= IFF_DONT_BRIDGE;
+
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
break;
case NETDEV_GOING_DOWN:
cfg80211_leave(rdev, wdev);
@@ -1157,6 +1221,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
* remove and clean it up.
*/
if (!list_empty(&wdev->list)) {
+ nl80211_notify_iface(rdev, wdev,
+ NL80211_CMD_DEL_INTERFACE);
sysfs_remove_link(&dev->dev.kobj, "phy80211");
list_del_rcu(&wdev->list);
rdev->devlist_generation++;
@@ -1246,7 +1312,7 @@ static int __init cfg80211_init(void)
if (err)
goto out_fail_reg;
- cfg80211_wq = create_singlethread_workqueue("cfg80211");
+ cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM);
if (!cfg80211_wq) {
err = -ENOMEM;
goto out_fail_wq;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index eee91443924d..af6e023020b1 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -71,6 +71,7 @@ struct cfg80211_registered_device {
struct list_head bss_list;
struct rb_root bss_tree;
u32 bss_generation;
+ u32 bss_entries;
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
struct sk_buff *scan_msg;
struct cfg80211_sched_scan_request __rcu *sched_scan_req;
@@ -249,9 +250,9 @@ struct cfg80211_event {
};
struct cfg80211_cached_keys {
- struct key_params params[6];
- u8 data[6][WLAN_MAX_KEY_LEN];
- int def, defmgmt;
+ struct key_params params[CFG80211_MAX_WEP_KEYS];
+ u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104];
+ int def;
};
enum cfg80211_chan_mode {
@@ -345,7 +346,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
const u8 *ssid, int ssid_len,
const u8 *ie, int ie_len,
const u8 *key, int key_len, int key_idx,
- const u8 *sae_data, int sae_data_len);
+ const u8 *auth_data, int auth_data_len);
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
struct net_device *dev,
struct ieee80211_channel *chan,
@@ -409,6 +410,7 @@ void cfg80211_sme_disassoc(struct wireless_dev *wdev);
void cfg80211_sme_deauth(struct wireless_dev *wdev);
void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
+void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);
/* internal helpers */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
@@ -475,7 +477,7 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
u32 *mask);
int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
- u32 beacon_int);
+ enum nl80211_iftype iftype, u32 beacon_int);
void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
enum nl80211_iftype iftype, int num);
@@ -488,6 +490,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
+void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 4a4dda53bdf1..364f900a3dc4 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -43,7 +43,8 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
cfg80211_hold_bss(bss_from_pub(bss));
wdev->current_bss = bss_from_pub(bss);
- cfg80211_upload_connect_keys(wdev);
+ if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
+ cfg80211_upload_connect_keys(wdev);
nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid,
GFP_KERNEL);
@@ -114,6 +115,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
}
}
+ if (WARN_ON(connkeys && connkeys->def < 0))
+ return -EINVAL;
+
if (WARN_ON(wdev->connect_keys))
kzfree(wdev->connect_keys);
wdev->connect_keys = connkeys;
@@ -284,18 +288,16 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
if (!netif_running(wdev->netdev))
return 0;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys)
wdev->wext.keys->def = wdev->wext.default_key;
- wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
- }
wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys && wdev->wext.keys->def != -1) {
ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
if (!ck)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
ck->params[i].key = ck->data[i];
}
err = __cfg80211_join_ibss(rdev, wdev->netdev,
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 71447cf86306..ba0a1f398ce5 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
break;
- case 0:
+ default:
memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
break;
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index fa2066b56f36..2d8518a37eab 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -183,6 +183,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
wdev->mesh_id_len = setup->mesh_id_len;
wdev->chandef = setup->chandef;
+ wdev->beacon_interval = setup->beacon_interval;
}
return err;
@@ -258,6 +259,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
err = rdev_leave_mesh(rdev, dev);
if (!err) {
wdev->mesh_id_len = 0;
+ wdev->beacon_interval = 0;
memset(&wdev->chandef, 0, sizeof(wdev->chandef));
rdev_set_qos_map(rdev, dev, NULL);
}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index c284d883c349..4646cf5695b9 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -149,6 +149,18 @@ void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss)
}
EXPORT_SYMBOL(cfg80211_assoc_timeout);
+void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+
+ cfg80211_sme_abandon_assoc(wdev);
+
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wiphy, bss);
+}
+EXPORT_SYMBOL(cfg80211_abandon_assoc);
+
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -204,14 +216,14 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
const u8 *ssid, int ssid_len,
const u8 *ie, int ie_len,
const u8 *key, int key_len, int key_idx,
- const u8 *sae_data, int sae_data_len)
+ const u8 *auth_data, int auth_data_len)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_auth_request req = {
.ie = ie,
.ie_len = ie_len,
- .sae_data = sae_data,
- .sae_data_len = sae_data_len,
+ .auth_data = auth_data,
+ .auth_data_len = auth_data_len,
.auth_type = auth_type,
.key = key,
.key_len = key_len,
@@ -222,7 +234,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
ASSERT_WDEV_LOCK(wdev);
if (auth_type == NL80211_AUTHTYPE_SHARED_KEY)
- if (!key || !key_len || key_idx < 0 || key_idx > 4)
+ if (!key || !key_len || key_idx < 0 || key_idx > 3)
return -EINVAL;
if (wdev->current_bss &&
@@ -634,6 +646,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
* fall through, P2P device only supports
* public action frames
*/
+ case NL80211_IFTYPE_NAN:
default:
err = -EOPNOTSUPP;
break;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4809f4d2cdcc..aee396b9f190 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -32,22 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
struct cfg80211_crypto_settings *settings,
int cipher_limit);
-static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
/* the netlink family */
-static struct genl_family nl80211_fam = {
- .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
- .name = NL80211_GENL_NAME, /* have users key off the name instead */
- .hdrsize = 0, /* no private header */
- .version = 1, /* no particular meaning now */
- .maxattr = NL80211_ATTR_MAX,
- .netnsok = true,
- .pre_doit = nl80211_pre_doit,
- .post_doit = nl80211_post_doit,
-};
+static struct genl_family nl80211_fam;
/* multicast groups */
enum nl80211_multicast_groups {
@@ -56,6 +42,7 @@ enum nl80211_multicast_groups {
NL80211_MCGRP_REGULATORY,
NL80211_MCGRP_MLME,
NL80211_MCGRP_VENDOR,
+ NL80211_MCGRP_NAN,
NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
};
@@ -65,6 +52,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = {
[NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG },
[NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME },
[NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR },
+ [NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN },
#ifdef CONFIG_NL80211_TESTMODE
[NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE }
#endif
@@ -355,7 +343,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
[NL80211_ATTR_WDEV] = { .type = NLA_U64 },
[NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
- [NL80211_ATTR_SAE_DATA] = { .type = NLA_BINARY, },
+ [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
[NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
[NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
@@ -409,6 +397,14 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
.len = VHT_MUMIMO_GROUPS_DATA_LEN
},
[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
+ [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
+ [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
+ [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
+ [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
+ .len = FILS_MAX_KEK_LEN },
+ [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
+ [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
+ [NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
};
/* policy for the key attributes */
@@ -430,6 +426,7 @@ nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
[NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
};
+#ifdef CONFIG_PM
/* policy for WoWLAN attributes */
static const struct nla_policy
nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
@@ -463,6 +460,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 },
[NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
};
+#endif /* CONFIG_PM */
/* policy for coalesce rule attributes */
static const struct nla_policy
@@ -502,6 +500,39 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
},
};
+/* policy for NAN function attributes */
+static const struct nla_policy
+nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
+ [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY,
+ .len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
+ [NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG },
+ [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG },
+ [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { .len = ETH_ALEN },
+ [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG },
+ [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 },
+ [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY,
+ .len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN },
+ [NL80211_NAN_FUNC_SRF] = { .type = NLA_NESTED },
+ [NL80211_NAN_FUNC_RX_MATCH_FILTER] = { .type = NLA_NESTED },
+ [NL80211_NAN_FUNC_TX_MATCH_FILTER] = { .type = NLA_NESTED },
+ [NL80211_NAN_FUNC_INSTANCE_ID] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_TERM_REASON] = { .type = NLA_U8 },
+};
+
+/* policy for Service Response Filter attributes */
+static const struct nla_policy
+nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
+ [NL80211_NAN_SRF_INCLUDE] = { .type = NLA_FLAG },
+ [NL80211_NAN_SRF_BF] = { .type = NLA_BINARY,
+ .len = NL80211_NAN_FUNC_SRF_MAX_LEN },
+ [NL80211_NAN_SRF_BF_IDX] = { .type = NLA_U8 },
+ [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
+};
+
static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct cfg80211_registered_device **rdev,
@@ -513,13 +544,14 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
if (!cb->args[0]) {
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ genl_family_attrbuf(&nl80211_fam),
+ nl80211_fam.maxattr, nl80211_policy);
if (err)
goto out_unlock;
- *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ *wdev = __cfg80211_wdev_from_attrs(
+ sock_net(skb->sk),
+ genl_family_attrbuf(&nl80211_fam));
if (IS_ERR(*wdev)) {
err = PTR_ERR(*wdev);
goto out_unlock;
@@ -848,13 +880,21 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
struct nlattr *key;
struct cfg80211_cached_keys *result;
int rem, err, def = 0;
+ bool have_key = false;
+
+ nla_for_each_nested(key, keys, rem) {
+ have_key = true;
+ break;
+ }
+
+ if (!have_key)
+ return NULL;
result = kzalloc(sizeof(*result), GFP_KERNEL);
if (!result)
return ERR_PTR(-ENOMEM);
result->def = -1;
- result->defmgmt = -1;
nla_for_each_nested(key, keys, rem) {
memset(&parse, 0, sizeof(parse));
@@ -866,7 +906,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
err = -EINVAL;
if (!parse.p.key)
goto error;
- if (parse.idx < 0 || parse.idx > 4)
+ if (parse.idx < 0 || parse.idx > 3)
goto error;
if (parse.def) {
if (def)
@@ -881,16 +921,24 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
parse.idx, false, NULL);
if (err)
goto error;
+ if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) {
+ err = -EINVAL;
+ goto error;
+ }
result->params[parse.idx].cipher = parse.p.cipher;
result->params[parse.idx].key_len = parse.p.key_len;
result->params[parse.idx].key = result->data[parse.idx];
memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);
- if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 ||
- parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) {
- if (no_ht)
- *no_ht = true;
- }
+ /* must be WEP key if we got here */
+ if (no_ht)
+ *no_ht = true;
+ }
+
+ if (result->def < 0) {
+ err = -EINVAL;
+ goto error;
}
return result;
@@ -918,6 +966,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
case NL80211_IFTYPE_UNSPECIFIED:
case NL80211_IFTYPE_OCB:
case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_NAN:
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_WDS:
case NUM_NL80211_IFTYPES:
@@ -1020,6 +1069,10 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy,
nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
c->radar_detect_regions)))
goto nla_put_failure;
+ if (c->beacon_int_min_gcd &&
+ nla_put_u32(msg, NL80211_IFACE_COMB_BI_MIN_GCD,
+ c->beacon_int_min_gcd))
+ goto nla_put_failure;
nla_nest_end(msg, nl_combi);
}
@@ -1267,6 +1320,95 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg,
return 0;
}
+#define CMD(op, n) \
+ do { \
+ if (rdev->ops->op) { \
+ i++; \
+ if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
+ goto nla_put_failure; \
+ } \
+ } while (0)
+
+static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ int i = 0;
+
+ /*
+ * do *NOT* add anything into this function, new things need to be
+ * advertised only to new versions of userspace that can deal with
+ * the split (and they can't possibly care about new features...
+ */
+ CMD(add_virtual_intf, NEW_INTERFACE);
+ CMD(change_virtual_intf, SET_INTERFACE);
+ CMD(add_key, NEW_KEY);
+ CMD(start_ap, START_AP);
+ CMD(add_station, NEW_STATION);
+ CMD(add_mpath, NEW_MPATH);
+ CMD(update_mesh_config, SET_MESH_CONFIG);
+ CMD(change_bss, SET_BSS);
+ CMD(auth, AUTHENTICATE);
+ CMD(assoc, ASSOCIATE);
+ CMD(deauth, DEAUTHENTICATE);
+ CMD(disassoc, DISASSOCIATE);
+ CMD(join_ibss, JOIN_IBSS);
+ CMD(join_mesh, JOIN_MESH);
+ CMD(set_pmksa, SET_PMKSA);
+ CMD(del_pmksa, DEL_PMKSA);
+ CMD(flush_pmksa, FLUSH_PMKSA);
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
+ CMD(remain_on_channel, REMAIN_ON_CHANNEL);
+ CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
+ CMD(mgmt_tx, FRAME);
+ CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
+ if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
+ goto nla_put_failure;
+ }
+ if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
+ rdev->ops->join_mesh) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
+ goto nla_put_failure;
+ }
+ CMD(set_wds_peer, SET_WDS_PEER);
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
+ CMD(tdls_mgmt, TDLS_MGMT);
+ CMD(tdls_oper, TDLS_OPER);
+ }
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ CMD(sched_scan_start, START_SCHED_SCAN);
+ CMD(probe_client, PROBE_CLIENT);
+ CMD(set_noack_map, SET_NOACK_MAP);
+ if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
+ goto nla_put_failure;
+ }
+ CMD(start_p2p_device, START_P2P_DEVICE);
+ CMD(set_mcast_rate, SET_MCAST_RATE);
+#ifdef CONFIG_NL80211_TESTMODE
+ CMD(testmode_cmd, TESTMODE);
+#endif
+
+ if (rdev->ops->connect || rdev->ops->auth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
+ goto nla_put_failure;
+ }
+
+ if (rdev->ops->disconnect || rdev->ops->deauth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
+ goto nla_put_failure;
+ }
+
+ return i;
+ nla_put_failure:
+ return -ENOBUFS;
+}
+
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -1494,68 +1636,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (!nl_cmds)
goto nla_put_failure;
- i = 0;
-#define CMD(op, n) \
- do { \
- if (rdev->ops->op) { \
- i++; \
- if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
- goto nla_put_failure; \
- } \
- } while (0)
-
- CMD(add_virtual_intf, NEW_INTERFACE);
- CMD(change_virtual_intf, SET_INTERFACE);
- CMD(add_key, NEW_KEY);
- CMD(start_ap, START_AP);
- CMD(add_station, NEW_STATION);
- CMD(add_mpath, NEW_MPATH);
- CMD(update_mesh_config, SET_MESH_CONFIG);
- CMD(change_bss, SET_BSS);
- CMD(auth, AUTHENTICATE);
- CMD(assoc, ASSOCIATE);
- CMD(deauth, DEAUTHENTICATE);
- CMD(disassoc, DISASSOCIATE);
- CMD(join_ibss, JOIN_IBSS);
- CMD(join_mesh, JOIN_MESH);
- CMD(set_pmksa, SET_PMKSA);
- CMD(del_pmksa, DEL_PMKSA);
- CMD(flush_pmksa, FLUSH_PMKSA);
- if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
- CMD(remain_on_channel, REMAIN_ON_CHANNEL);
- CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
- CMD(mgmt_tx, FRAME);
- CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
- if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
- goto nla_put_failure;
- }
- if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
- rdev->ops->join_mesh) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
- goto nla_put_failure;
- }
- CMD(set_wds_peer, SET_WDS_PEER);
- if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
- CMD(tdls_mgmt, TDLS_MGMT);
- CMD(tdls_oper, TDLS_OPER);
- }
- if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
- CMD(sched_scan_start, START_SCHED_SCAN);
- CMD(probe_client, PROBE_CLIENT);
- CMD(set_noack_map, SET_NOACK_MAP);
- if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
- goto nla_put_failure;
- }
- CMD(start_p2p_device, START_P2P_DEVICE);
- CMD(set_mcast_rate, SET_MCAST_RATE);
-#ifdef CONFIG_NL80211_TESTMODE
- CMD(testmode_cmd, TESTMODE);
-#endif
+ i = nl80211_add_commands_unsplit(rdev, msg);
+ if (i < 0)
+ goto nla_put_failure;
if (state->split) {
CMD(crit_proto_start, CRIT_PROTOCOL_START);
CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
@@ -1565,22 +1648,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (rdev->wiphy.features &
NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)
CMD(add_tx_ts, ADD_TX_TS);
+ CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
+ CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
}
- /* add into the if now */
#undef CMD
- if (rdev->ops->connect || rdev->ops->auth) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
- goto nla_put_failure;
- }
-
- if (rdev->ops->disconnect || rdev->ops->deauth) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
- goto nla_put_failure;
- }
-
nla_nest_end(msg, nl_cmds);
state->split_start++;
if (state->split)
@@ -1826,7 +1898,7 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl80211_dump_wiphy_state *state)
{
- struct nlattr **tb = nl80211_fam.attrbuf;
+ struct nlattr **tb = genl_family_attrbuf(&nl80211_fam);
int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
tb, nl80211_fam.maxattr, nl80211_policy);
/* ignore parse errors for backward compatibility */
@@ -2241,10 +2313,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(nl_txq_params,
info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
rem_txq_params) {
- result = nla_parse(tb, NL80211_TXQ_ATTR_MAX,
- nla_data(nl_txq_params),
- nla_len(nl_txq_params),
- txq_params_policy);
+ result = nla_parse_nested(tb, NL80211_TXQ_ATTR_MAX,
+ nl_txq_params,
+ txq_params_policy);
if (result)
return result;
result = parse_txq_params(tb, &txq_params);
@@ -2525,10 +2596,35 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
int if_idx = 0;
int wp_start = cb->args[0];
int if_start = cb->args[1];
+ int filter_wiphy = -1;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
rtnl_lock();
+ if (!cb->args[2]) {
+ struct nl80211_dump_wiphy_state state = {
+ .filter_wiphy = -1,
+ };
+ int ret;
+
+ ret = nl80211_dump_wiphy_parse(skb, cb, &state);
+ if (ret)
+ return ret;
+
+ filter_wiphy = state.filter_wiphy;
+
+ /*
+ * if filtering, set cb->args[2] to +1 since 0 is the default
+ * value needed to determine that parsing is necessary.
+ */
+ if (filter_wiphy >= 0)
+ cb->args[2] = filter_wiphy + 1;
+ else
+ cb->args[2] = -1;
+ } else if (cb->args[2] > 0) {
+ filter_wiphy = cb->args[2] - 1;
+ }
+
list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
continue;
@@ -2536,6 +2632,10 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
wp_idx++;
continue;
}
+
+ if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx)
+ continue;
+
if_idx = 0;
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
@@ -2751,7 +2851,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct vif_params params;
struct wireless_dev *wdev;
- struct sk_buff *msg, *event;
+ struct sk_buff *msg;
int err;
enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
u32 flags;
@@ -2774,7 +2874,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
!(rdev->wiphy.interface_modes & (1 << type)))
return -EOPNOTSUPP;
- if ((type == NL80211_IFTYPE_P2P_DEVICE ||
+ if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN ||
rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) &&
info->attrs[NL80211_ATTR_MAC]) {
nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC],
@@ -2830,9 +2930,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
wdev->mesh_id_up_len);
wdev_unlock(wdev);
break;
+ case NL80211_IFTYPE_NAN:
case NL80211_IFTYPE_P2P_DEVICE:
/*
- * P2P Device doesn't have a netdev, so doesn't go
+ * P2P Device and NAN do not have a netdev, so don't go
* through the netdev notifier and must be added here
*/
mutex_init(&wdev->mtx);
@@ -2855,20 +2956,15 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
return -ENOBUFS;
}
- event = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (event) {
- if (nl80211_send_iface(event, 0, 0, 0,
- rdev, wdev, false) < 0) {
- nlmsg_free(event);
- goto out;
- }
-
- genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
- event, 0, NL80211_MCGRP_CONFIG,
- GFP_KERNEL);
- }
+ /*
+ * For wdevs which have no associated netdev object (e.g. of type
+ * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here.
+ * For all other types, the event will be generated from the
+ * netdev notifier
+ */
+ if (!wdev->netdev)
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
-out:
return genlmsg_reply(msg, info);
}
@@ -2876,18 +2972,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct wireless_dev *wdev = info->user_ptr[1];
- struct sk_buff *msg;
- int status;
if (!rdev->ops->del_virtual_intf)
return -EOPNOTSUPP;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (msg && nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, true) < 0) {
- nlmsg_free(msg);
- msg = NULL;
- }
-
/*
* If we remove a wireless device without a netdev then clear
* user_ptr[1] so that nl80211_post_doit won't dereference it
@@ -2898,15 +2986,7 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
if (!wdev->netdev)
info->user_ptr[1] = NULL;
- status = rdev_del_virtual_intf(rdev, wdev);
- if (status >= 0 && msg)
- genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
- msg, 0, NL80211_MCGRP_CONFIG,
- GFP_KERNEL);
- else
- nlmsg_free(msg);
-
- return status;
+ return rdev_del_virtual_intf(rdev, wdev);
}
static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info)
@@ -3316,6 +3396,291 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
+ u8 *rates, u8 rates_len)
+{
+ u8 i;
+ u32 mask = 0;
+
+ for (i = 0; i < rates_len; i++) {
+ int rate = (rates[i] & 0x7f) * 5;
+ int ridx;
+
+ for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
+ struct ieee80211_rate *srate =
+ &sband->bitrates[ridx];
+ if (rate == srate->bitrate) {
+ mask |= 1 << ridx;
+ break;
+ }
+ }
+ if (ridx == sband->n_bitrates)
+ return 0; /* rate not found */
+ }
+
+ return mask;
+}
+
+static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
+ u8 *rates, u8 rates_len,
+ u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
+{
+ u8 i;
+
+ memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);
+
+ for (i = 0; i < rates_len; i++) {
+ int ridx, rbit;
+
+ ridx = rates[i] / 8;
+ rbit = BIT(rates[i] % 8);
+
+ /* check validity */
+ if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN))
+ return false;
+
+ /* check availability */
+ if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
+ mcs[ridx] |= rbit;
+ else
+ return false;
+ }
+
+ return true;
+}
+
+static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
+{
+ u16 mcs_mask = 0;
+
+ switch (vht_mcs_map) {
+ case IEEE80211_VHT_MCS_NOT_SUPPORTED:
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_7:
+ mcs_mask = 0x00FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_8:
+ mcs_mask = 0x01FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_9:
+ mcs_mask = 0x03FF;
+ break;
+ default:
+ break;
+ }
+
+ return mcs_mask;
+}
+
+static void vht_build_mcs_mask(u16 vht_mcs_map,
+ u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
+{
+ u8 nss;
+
+ for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) {
+ vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03);
+ vht_mcs_map >>= 2;
+ }
+}
+
+static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
+ struct nl80211_txrate_vht *txrate,
+ u16 mcs[NL80211_VHT_NSS_MAX])
+{
+ u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
+ u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {};
+ u8 i;
+
+ if (!sband->vht_cap.vht_supported)
+ return false;
+
+ memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);
+
+ /* Build vht_mcs_mask from VHT capabilities */
+ vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
+ if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
+ mcs[i] = txrate->mcs[i];
+ else
+ return false;
+ }
+
+ return true;
+}
+
+static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
+ [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_RATES },
+ [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_HT_RATES },
+ [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)},
+ [NL80211_TXRATE_GI] = { .type = NLA_U8 },
+};
+
+static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
+ struct cfg80211_bitrate_mask *mask)
+{
+ struct nlattr *tb[NL80211_TXRATE_MAX + 1];
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int rem, i;
+ struct nlattr *tx_rates;
+ struct ieee80211_supported_band *sband;
+ u16 vht_tx_mcs_map;
+
+ memset(mask, 0, sizeof(*mask));
+ /* Default to all rates enabled */
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ sband = rdev->wiphy.bands[i];
+
+ if (!sband)
+ continue;
+
+ mask->control[i].legacy = (1 << sband->n_bitrates) - 1;
+ memcpy(mask->control[i].ht_mcs,
+ sband->ht_cap.mcs.rx_mask,
+ sizeof(mask->control[i].ht_mcs));
+
+ if (!sband->vht_cap.vht_supported)
+ continue;
+
+ vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
+ vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs);
+ }
+
+ /* if no rates are given set it back to the defaults */
+ if (!info->attrs[NL80211_ATTR_TX_RATES])
+ goto out;
+
+ /* The nested attribute uses enum nl80211_band as the index. This maps
+ * directly to the enum nl80211_band values used in cfg80211.
+ */
+ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
+ nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
+ enum nl80211_band band = nla_type(tx_rates);
+ int err;
+
+ if (band < 0 || band >= NUM_NL80211_BANDS)
+ return -EINVAL;
+ sband = rdev->wiphy.bands[band];
+ if (sband == NULL)
+ return -EINVAL;
+ err = nla_parse_nested(tb, NL80211_TXRATE_MAX, tx_rates,
+ nl80211_txattr_policy);
+ if (err)
+ return err;
+ if (tb[NL80211_TXRATE_LEGACY]) {
+ mask->control[band].legacy = rateset_to_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_LEGACY]),
+ nla_len(tb[NL80211_TXRATE_LEGACY]));
+ if ((mask->control[band].legacy == 0) &&
+ nla_len(tb[NL80211_TXRATE_LEGACY]))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_HT]) {
+ if (!ht_rateset_to_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_HT]),
+ nla_len(tb[NL80211_TXRATE_HT]),
+ mask->control[band].ht_mcs))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_VHT]) {
+ if (!vht_set_mcs_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_VHT]),
+ mask->control[band].vht_mcs))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_GI]) {
+ mask->control[band].gi =
+ nla_get_u8(tb[NL80211_TXRATE_GI]);
+ if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI)
+ return -EINVAL;
+ }
+
+ if (mask->control[band].legacy == 0) {
+ /* don't allow empty legacy rates if HT or VHT
+ * are not even supported.
+ */
+ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
+ rdev->wiphy.bands[band]->vht_cap.vht_supported))
+ return -EINVAL;
+
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
+ if (mask->control[band].ht_mcs[i])
+ goto out;
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
+ if (mask->control[band].vht_mcs[i])
+ goto out;
+
+ /* legacy and mcs rates may not be both empty */
+ return -EINVAL;
+ }
+ }
+
+out:
+ return 0;
+}
+
+static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
+ enum nl80211_band band,
+ struct cfg80211_bitrate_mask *beacon_rate)
+{
+ u32 count_ht, count_vht, i;
+ u32 rate = beacon_rate->control[band].legacy;
+
+ /* Allow only one rate */
+ if (hweight32(rate) > 1)
+ return -EINVAL;
+
+ count_ht = 0;
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
+ if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) {
+ return -EINVAL;
+ } else if (beacon_rate->control[band].ht_mcs[i]) {
+ count_ht++;
+ if (count_ht > 1)
+ return -EINVAL;
+ }
+ if (count_ht && rate)
+ return -EINVAL;
+ }
+
+ count_vht = 0;
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
+ if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) {
+ return -EINVAL;
+ } else if (beacon_rate->control[band].vht_mcs[i]) {
+ count_vht++;
+ if (count_vht > 1)
+ return -EINVAL;
+ }
+ if (count_vht && rate)
+ return -EINVAL;
+ }
+
+ if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht))
+ return -EINVAL;
+
+ if (rate &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_LEGACY))
+ return -EINVAL;
+ if (count_ht &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_HT))
+ return -EINVAL;
+ if (count_vht &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_VHT))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nl80211_parse_beacon(struct nlattr *attrs[],
struct cfg80211_beacon_data *bcn)
{
@@ -3407,12 +3772,23 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
auth_type == NL80211_AUTHTYPE_SAE)
return false;
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_FILS_STA) &&
+ (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK))
+ return false;
return true;
case NL80211_CMD_CONNECT:
case NL80211_CMD_START_AP:
/* SAE not supported yet */
if (auth_type == NL80211_AUTHTYPE_SAE)
return false;
+ /* FILS not supported yet */
+ if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK)
+ return false;
return true;
default:
return false;
@@ -3454,7 +3830,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.dtim_period =
nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
- err = cfg80211_validate_beacon_int(rdev, params.beacon_interval);
+ err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype,
+ params.beacon_interval);
if (err)
return err;
@@ -3545,6 +3922,17 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
wdev->iftype))
return -EINVAL;
+ if (info->attrs[NL80211_ATTR_TX_RATES]) {
+ err = nl80211_parse_tx_bitrate_mask(info, &params.beacon_rate);
+ if (err)
+ return err;
+
+ err = validate_beacon_tx_rate(rdev, params.chandef.chan->band,
+ &params.beacon_rate);
+ if (err)
+ return err;
+ }
+
if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
params.smps_mode =
nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]);
@@ -4227,6 +4615,15 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
break;
}
+ /*
+ * Older kernel versions ignored this attribute entirely, so don't
+ * reject attempts to update it but mark it as unused instead so the
+ * driver won't look at the data.
+ */
+ if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
+ statype != CFG80211_STA_TDLS_PEER_SETUP)
+ params->opmode_notif_used = false;
+
return 0;
}
EXPORT_SYMBOL(cfg80211_check_station_change);
@@ -4466,6 +4863,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
params.local_pm = pm;
}
+ if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
+ params.opmode_notif_used = true;
+ params.opmode_notif =
+ nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
+ }
+
/* Include parameters for TDLS peer (will check later) */
err = nl80211_set_station_tdls(info, &params);
if (err)
@@ -5374,6 +5777,18 @@ static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *ou
return 0;
}
+static int nl80211_check_power_mode(const struct nlattr *nla,
+ enum nl80211_mesh_power_mode min,
+ enum nl80211_mesh_power_mode max,
+ enum nl80211_mesh_power_mode *out)
+{
+ u32 val = nla_get_u32(nla);
+ if (val < min || val > max)
+ return -EINVAL;
+ *out = val;
+ return 0;
+}
+
static int nl80211_parse_mesh_config(struct genl_info *info,
struct mesh_config *cfg,
u32 *mask_out)
@@ -5501,6 +5916,7 @@ do { \
break;
}
cfg->ht_opmode = ht_opmode;
+ mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
}
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
1, 65535, mask,
@@ -5518,7 +5934,7 @@ do { \
NL80211_MESH_POWER_ACTIVE,
NL80211_MESH_POWER_MAX,
mask, NL80211_MESHCONF_POWER_MODE,
- nl80211_check_u32);
+ nl80211_check_power_mode);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
0, 65535, mask,
NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16);
@@ -5933,9 +6349,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
rem_reg_rules) {
- r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
- nla_data(nl_reg_rule), nla_len(nl_reg_rule),
- reg_rule_policy);
+ r = nla_parse_nested(tb, NL80211_REG_RULE_ATTR_MAX,
+ nl_reg_rule, reg_rule_policy);
if (r)
goto bad_reg;
r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
@@ -6002,8 +6417,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
if (!nla_ok(nest, nla_len(nest)))
return -EINVAL;
- err = nla_parse(attr, NL80211_BSS_SELECT_ATTR_MAX, nla_data(nest),
- nla_len(nest), nl80211_bss_select_policy);
+ err = nla_parse_nested(attr, NL80211_BSS_SELECT_ATTR_MAX, nest,
+ nl80211_bss_select_policy);
if (err)
return err;
@@ -6102,6 +6517,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
wiphy = &rdev->wiphy;
+ if (wdev->iftype == NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
if (!rdev->ops->scan)
return -EOPNOTSUPP;
@@ -6302,7 +6720,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
request->no_cck =
nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
- if (info->attrs[NL80211_ATTR_MAC])
+ /* Initial implementation used NL80211_ATTR_MAC to set the specific
+ * BSSID to scan for. This was problematic because that same attribute
+ * was already used for another purpose (local random MAC address). The
+ * NL80211_ATTR_BSSID attribute was added to fix this. For backwards
+ * compatibility with older userspace components, also use the
+ * NL80211_ATTR_MAC value here if it can be determined to be used for
+ * the specific BSSID use case instead of the random MAC address
+ * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use).
+ */
+ if (info->attrs[NL80211_ATTR_BSSID])
+ memcpy(request->bssid,
+ nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN);
+ else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
+ info->attrs[NL80211_ATTR_MAC])
memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
ETH_ALEN);
else
@@ -6390,9 +6821,8 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
if (WARN_ON(i >= n_plans))
return -EINVAL;
- err = nla_parse(plan, NL80211_SCHED_SCAN_PLAN_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_plan_policy);
+ err = nla_parse_nested(plan, NL80211_SCHED_SCAN_PLAN_MAX,
+ attr, nl80211_plan_policy);
if (err)
return err;
@@ -6481,9 +6911,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
tmp) {
struct nlattr *rssi;
- err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_match_policy);
+ err = nla_parse_nested(tb,
+ NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ attr, nl80211_match_policy);
if (err)
return ERR_PTR(err);
/* add other standalone attributes here */
@@ -6654,9 +7084,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
tmp) {
struct nlattr *ssid, *rssi;
- err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_match_policy);
+ err = nla_parse_nested(tb,
+ NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ attr, nl80211_match_policy);
if (err)
goto out_free;
ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID];
@@ -7268,6 +7698,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
struct survey_info survey;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
@@ -7280,7 +7711,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
return res;
/* prepare_wdev_dump parsed the attributes */
- radio_stats = nl80211_fam.attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
+ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
if (!wdev->netdev) {
res = -EINVAL;
@@ -7333,8 +7764,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct ieee80211_channel *chan;
- const u8 *bssid, *ssid, *ie = NULL, *sae_data = NULL;
- int err, ssid_len, ie_len = 0, sae_data_len = 0;
+ const u8 *bssid, *ssid, *ie = NULL, *auth_data = NULL;
+ int err, ssid_len, ie_len = 0, auth_data_len = 0;
enum nl80211_auth_type auth_type;
struct key_parse key;
bool local_state_change;
@@ -7368,7 +7799,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
(key.p.cipher != WLAN_CIPHER_SUITE_WEP104 ||
key.p.key_len != WLAN_KEY_LEN_WEP104))
return -EINVAL;
- if (key.idx > 4)
+ if (key.idx > 3)
return -EINVAL;
} else {
key.p.key_len = 0;
@@ -7414,17 +7845,23 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
return -EINVAL;
- if (auth_type == NL80211_AUTHTYPE_SAE &&
- !info->attrs[NL80211_ATTR_SAE_DATA])
+ if ((auth_type == NL80211_AUTHTYPE_SAE ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK) &&
+ !info->attrs[NL80211_ATTR_AUTH_DATA])
return -EINVAL;
- if (info->attrs[NL80211_ATTR_SAE_DATA]) {
- if (auth_type != NL80211_AUTHTYPE_SAE)
+ if (info->attrs[NL80211_ATTR_AUTH_DATA]) {
+ if (auth_type != NL80211_AUTHTYPE_SAE &&
+ auth_type != NL80211_AUTHTYPE_FILS_SK &&
+ auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
+ auth_type != NL80211_AUTHTYPE_FILS_PK)
return -EINVAL;
- sae_data = nla_data(info->attrs[NL80211_ATTR_SAE_DATA]);
- sae_data_len = nla_len(info->attrs[NL80211_ATTR_SAE_DATA]);
+ auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
+ auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
/* need to include at least Auth Transaction and Status Code */
- if (sae_data_len < 4)
+ if (auth_data_len < 4)
return -EINVAL;
}
@@ -7441,7 +7878,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
ssid, ssid_len, ie, ie_len,
key.p.key, key.p.key_len, key.idx,
- sae_data, sae_data_len);
+ auth_data, auth_data_len);
wdev_unlock(dev->ieee80211_ptr);
return err;
}
@@ -7620,6 +8057,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
req.flags |= ASSOC_REQ_USE_RRM;
}
+ if (info->attrs[NL80211_ATTR_FILS_KEK]) {
+ req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]);
+ req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]);
+ if (!info->attrs[NL80211_ATTR_FILS_NONCES])
+ return -EINVAL;
+ req.fils_nonces =
+ nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
+ }
+
err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
if (!err) {
wdev_lock(dev->ieee80211_ptr);
@@ -7773,12 +8219,14 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
ibss.beacon_interval = 100;
- if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
+ if (info->attrs[NL80211_ATTR_BEACON_INTERVAL])
ibss.beacon_interval =
nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
- if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000)
- return -EINVAL;
- }
+
+ err = cfg80211_validate_beacon_int(rdev, NL80211_IFTYPE_ADHOC,
+ ibss.beacon_interval);
+ if (err)
+ return err;
if (!rdev->ops->join_ibss)
return -EOPNOTSUPP;
@@ -7985,6 +8433,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
}
data = nla_nest_start(skb, attr);
+ if (!data)
+ goto nla_put_failure;
((void **)skb->cb)[0] = rdev;
((void **)skb->cb)[1] = hdr;
@@ -8100,14 +8550,14 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
*/
phy_idx = cb->args[0] - 1;
} else {
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
+
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ attrbuf, nl80211_fam.maxattr, nl80211_policy);
if (err)
goto out_err;
- rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(rdev)) {
err = PTR_ERR(rdev);
goto out_err;
@@ -8115,9 +8565,8 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
phy_idx = rdev->wiphy_idx;
rdev = NULL;
- if (nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA])
- cb->args[1] =
- (long)nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA];
+ if (attrbuf[NL80211_ATTR_TESTDATA])
+ cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
}
if (cb->args[1]) {
@@ -8348,6 +8797,37 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int nl80211_update_connect_params(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_connect_params connect = {};
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ u32 changed = 0;
+ int ret;
+
+ if (!rdev->ops->update_connect_params)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+ connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ changed |= UPDATE_ASSOC_IES;
+ }
+
+ wdev_lock(dev->ieee80211_ptr);
+ if (!wdev->current_bss)
+ ret = -ENOLINK;
+ else
+ ret = rdev_update_connect_params(rdev, dev, &connect, changed);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return ret;
+}
+
static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -8602,238 +9082,21 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
return rdev_cancel_remain_on_channel(rdev, wdev, cookie);
}
-static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
- u8 *rates, u8 rates_len)
-{
- u8 i;
- u32 mask = 0;
-
- for (i = 0; i < rates_len; i++) {
- int rate = (rates[i] & 0x7f) * 5;
- int ridx;
-
- for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
- struct ieee80211_rate *srate =
- &sband->bitrates[ridx];
- if (rate == srate->bitrate) {
- mask |= 1 << ridx;
- break;
- }
- }
- if (ridx == sband->n_bitrates)
- return 0; /* rate not found */
- }
-
- return mask;
-}
-
-static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
- u8 *rates, u8 rates_len,
- u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
-{
- u8 i;
-
- memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);
-
- for (i = 0; i < rates_len; i++) {
- int ridx, rbit;
-
- ridx = rates[i] / 8;
- rbit = BIT(rates[i] % 8);
-
- /* check validity */
- if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN))
- return false;
-
- /* check availability */
- if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
- mcs[ridx] |= rbit;
- else
- return false;
- }
-
- return true;
-}
-
-static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
-{
- u16 mcs_mask = 0;
-
- switch (vht_mcs_map) {
- case IEEE80211_VHT_MCS_NOT_SUPPORTED:
- break;
- case IEEE80211_VHT_MCS_SUPPORT_0_7:
- mcs_mask = 0x00FF;
- break;
- case IEEE80211_VHT_MCS_SUPPORT_0_8:
- mcs_mask = 0x01FF;
- break;
- case IEEE80211_VHT_MCS_SUPPORT_0_9:
- mcs_mask = 0x03FF;
- break;
- default:
- break;
- }
-
- return mcs_mask;
-}
-
-static void vht_build_mcs_mask(u16 vht_mcs_map,
- u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
-{
- u8 nss;
-
- for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) {
- vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03);
- vht_mcs_map >>= 2;
- }
-}
-
-static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
- struct nl80211_txrate_vht *txrate,
- u16 mcs[NL80211_VHT_NSS_MAX])
-{
- u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
- u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {};
- u8 i;
-
- if (!sband->vht_cap.vht_supported)
- return false;
-
- memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);
-
- /* Build vht_mcs_mask from VHT capabilities */
- vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
-
- for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
- if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
- mcs[i] = txrate->mcs[i];
- else
- return false;
- }
-
- return true;
-}
-
-static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
- [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
- .len = NL80211_MAX_SUPP_RATES },
- [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
- .len = NL80211_MAX_SUPP_HT_RATES },
- [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)},
- [NL80211_TXRATE_GI] = { .type = NLA_U8 },
-};
-
static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
struct genl_info *info)
{
- struct nlattr *tb[NL80211_TXRATE_MAX + 1];
- struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct cfg80211_bitrate_mask mask;
- int rem, i;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
- struct nlattr *tx_rates;
- struct ieee80211_supported_band *sband;
- u16 vht_tx_mcs_map;
+ int err;
if (!rdev->ops->set_bitrate_mask)
return -EOPNOTSUPP;
- memset(&mask, 0, sizeof(mask));
- /* Default to all rates enabled */
- for (i = 0; i < NUM_NL80211_BANDS; i++) {
- sband = rdev->wiphy.bands[i];
-
- if (!sband)
- continue;
-
- mask.control[i].legacy = (1 << sband->n_bitrates) - 1;
- memcpy(mask.control[i].ht_mcs,
- sband->ht_cap.mcs.rx_mask,
- sizeof(mask.control[i].ht_mcs));
-
- if (!sband->vht_cap.vht_supported)
- continue;
-
- vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
- vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs);
- }
-
- /* if no rates are given set it back to the defaults */
- if (!info->attrs[NL80211_ATTR_TX_RATES])
- goto out;
-
- /*
- * The nested attribute uses enum nl80211_band as the index. This maps
- * directly to the enum nl80211_band values used in cfg80211.
- */
- BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
- nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
- enum nl80211_band band = nla_type(tx_rates);
- int err;
-
- if (band < 0 || band >= NUM_NL80211_BANDS)
- return -EINVAL;
- sband = rdev->wiphy.bands[band];
- if (sband == NULL)
- return -EINVAL;
- err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
- nla_len(tx_rates), nl80211_txattr_policy);
- if (err)
- return err;
- if (tb[NL80211_TXRATE_LEGACY]) {
- mask.control[band].legacy = rateset_to_mask(
- sband,
- nla_data(tb[NL80211_TXRATE_LEGACY]),
- nla_len(tb[NL80211_TXRATE_LEGACY]));
- if ((mask.control[band].legacy == 0) &&
- nla_len(tb[NL80211_TXRATE_LEGACY]))
- return -EINVAL;
- }
- if (tb[NL80211_TXRATE_HT]) {
- if (!ht_rateset_to_mask(
- sband,
- nla_data(tb[NL80211_TXRATE_HT]),
- nla_len(tb[NL80211_TXRATE_HT]),
- mask.control[band].ht_mcs))
- return -EINVAL;
- }
- if (tb[NL80211_TXRATE_VHT]) {
- if (!vht_set_mcs_mask(
- sband,
- nla_data(tb[NL80211_TXRATE_VHT]),
- mask.control[band].vht_mcs))
- return -EINVAL;
- }
- if (tb[NL80211_TXRATE_GI]) {
- mask.control[band].gi =
- nla_get_u8(tb[NL80211_TXRATE_GI]);
- if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI)
- return -EINVAL;
- }
-
- if (mask.control[band].legacy == 0) {
- /* don't allow empty legacy rates if HT or VHT
- * are not even supported.
- */
- if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
- rdev->wiphy.bands[band]->vht_cap.vht_supported))
- return -EINVAL;
-
- for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
- if (mask.control[band].ht_mcs[i])
- goto out;
-
- for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
- if (mask.control[band].vht_mcs[i])
- goto out;
-
- /* legacy and mcs rates may not be both empty */
- return -EINVAL;
- }
- }
+ err = nl80211_parse_tx_bitrate_mask(info, &mask);
+ if (err)
+ return err;
-out:
return rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
}
@@ -8859,6 +9122,7 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_P2P_DEVICE:
break;
+ case NL80211_IFTYPE_NAN:
default:
return -EOPNOTSUPP;
}
@@ -8904,6 +9168,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_P2P_GO:
break;
+ case NL80211_IFTYPE_NAN:
default:
return -EOPNOTSUPP;
}
@@ -9020,6 +9285,7 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_P2P_DEVICE:
break;
+ case NL80211_IFTYPE_NAN:
default:
return -EOPNOTSUPP;
}
@@ -9252,9 +9518,12 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
setup.beacon_interval =
nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
- if (setup.beacon_interval < 10 ||
- setup.beacon_interval > 10000)
- return -EINVAL;
+
+ err = cfg80211_validate_beacon_int(rdev,
+ NL80211_IFTYPE_MESH_POINT,
+ setup.beacon_interval);
+ if (err)
+ return err;
}
if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) {
@@ -9300,6 +9569,17 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
return err;
}
+ if (info->attrs[NL80211_ATTR_TX_RATES]) {
+ err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate);
+ if (err)
+ return err;
+
+ err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
+ &setup.beacon_rate);
+ if (err)
+ return err;
+ }
+
return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
}
@@ -9413,18 +9693,27 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
if (!freqs)
return -ENOBUFS;
- for (i = 0; i < req->n_channels; i++)
- nla_put_u32(msg, i, req->channels[i]->center_freq);
+ for (i = 0; i < req->n_channels; i++) {
+ if (nla_put_u32(msg, i, req->channels[i]->center_freq))
+ return -ENOBUFS;
+ }
nla_nest_end(msg, freqs);
if (req->n_match_sets) {
matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH);
+ if (!matches)
+ return -ENOBUFS;
+
for (i = 0; i < req->n_match_sets; i++) {
match = nla_nest_start(msg, i);
- nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
- req->match_sets[i].ssid.ssid_len,
- req->match_sets[i].ssid.ssid);
+ if (!match)
+ return -ENOBUFS;
+
+ if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
+ req->match_sets[i].ssid.ssid_len,
+ req->match_sets[i].ssid.ssid))
+ return -ENOBUFS;
nla_nest_end(msg, match);
}
nla_nest_end(msg, matches);
@@ -9436,6 +9725,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
for (i = 0; i < req->n_scan_plans; i++) {
scan_plan = nla_nest_start(msg, i + 1);
+ if (!scan_plan)
+ return -ENOBUFS;
+
if (!scan_plan ||
nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
req->scan_plans[i].interval) ||
@@ -9540,9 +9832,8 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
if (!rdev->wiphy.wowlan->tcp)
return -EINVAL;
- err = nla_parse(tb, MAX_NL80211_WOWLAN_TCP,
- nla_data(attr), nla_len(attr),
- nl80211_wowlan_tcp_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TCP, attr,
+ nl80211_wowlan_tcp_policy);
if (err)
return err;
@@ -9687,9 +9978,7 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
goto out;
}
- err = nla_parse(tb, NL80211_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_policy);
+ err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy);
if (err)
goto out;
@@ -9723,10 +10012,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
goto set_wakeup;
}
- err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG,
- nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
- nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
- nl80211_wowlan_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TRIG,
+ info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS],
+ nl80211_wowlan_policy);
if (err)
return err;
@@ -9808,8 +10096,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
rem) {
u8 *mask_pat;
- nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
- nla_len(pat), NULL);
+ nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+ NULL);
err = -EINVAL;
if (!pat_tb[NL80211_PKTPAT_MASK] ||
!pat_tb[NL80211_PKTPAT_PATTERN])
@@ -10019,8 +10307,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
- err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
- nla_len(rule), nl80211_coalesce_policy);
+ err = nla_parse_nested(tb, NL80211_ATTR_COALESCE_RULE_MAX, rule,
+ nl80211_coalesce_policy);
if (err)
return err;
@@ -10058,8 +10346,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
rem) {
u8 *mask_pat;
- nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
- nla_len(pat), NULL);
+ nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL);
if (!pat_tb[NL80211_PKTPAT_MASK] ||
!pat_tb[NL80211_PKTPAT_PATTERN])
return -EINVAL;
@@ -10178,10 +10465,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_REKEY_DATA])
return -EINVAL;
- err = nla_parse(tb, MAX_NL80211_REKEY_DATA,
- nla_data(info->attrs[NL80211_ATTR_REKEY_DATA]),
- nla_len(info->attrs[NL80211_ATTR_REKEY_DATA]),
- nl80211_rekey_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_REKEY_DATA,
+ info->attrs[NL80211_ATTR_REKEY_DATA],
+ nl80211_rekey_policy);
if (err)
return err;
@@ -10330,7 +10616,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
return -EOPNOTSUPP;
- if (wdev->p2p_started)
+ if (wdev_running(wdev))
return 0;
if (rfkill_blocked(rdev->rfkill))
@@ -10340,7 +10626,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- wdev->p2p_started = true;
+ wdev->is_running = true;
rdev->opencount++;
return 0;
@@ -10362,6 +10648,547 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
return 0;
}
+static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_nan_conf conf = {};
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (wdev_running(wdev))
+ return -EEXIST;
+
+ if (rfkill_blocked(rdev->rfkill))
+ return -ERFKILL;
+
+ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_NAN_DUAL])
+ return -EINVAL;
+
+ conf.master_pref =
+ nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
+ if (!conf.master_pref)
+ return -EINVAL;
+
+ conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+
+ err = rdev_start_nan(rdev, wdev, &conf);
+ if (err)
+ return err;
+
+ wdev->is_running = true;
+ rdev->opencount++;
+
+ return 0;
+}
+
+static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ if (wdev->iftype != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ cfg80211_stop_nan(rdev, wdev);
+
+ return 0;
+}
+
+static int validate_nan_filter(struct nlattr *filter_attr)
+{
+ struct nlattr *attr;
+ int len = 0, n_entries = 0, rem;
+
+ nla_for_each_nested(attr, filter_attr, rem) {
+ len += nla_len(attr);
+ n_entries++;
+ }
+
+ if (len >= U8_MAX)
+ return -EINVAL;
+
+ return n_entries;
+}
+
+static int handle_nan_filter(struct nlattr *attr_filter,
+ struct cfg80211_nan_func *func,
+ bool tx)
+{
+ struct nlattr *attr;
+ int n_entries, rem, i;
+ struct cfg80211_nan_func_filter *filter;
+
+ n_entries = validate_nan_filter(attr_filter);
+ if (n_entries < 0)
+ return n_entries;
+
+ BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters));
+
+ filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL);
+ if (!filter)
+ return -ENOMEM;
+
+ i = 0;
+ nla_for_each_nested(attr, attr_filter, rem) {
+ filter[i].filter = nla_memdup(attr, GFP_KERNEL);
+ filter[i].len = nla_len(attr);
+ i++;
+ }
+ if (tx) {
+ func->num_tx_filters = n_entries;
+ func->tx_filters = filter;
+ } else {
+ func->num_rx_filters = n_entries;
+ func->rx_filters = filter;
+ }
+
+ return 0;
+}
+
+static int nl80211_nan_add_func(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr;
+ struct cfg80211_nan_func *func;
+ struct sk_buff *msg = NULL;
+ void *hdr = NULL;
+ int err = 0;
+
+ if (wdev->iftype != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (!wdev_running(wdev))
+ return -ENOTCONN;
+
+ if (!info->attrs[NL80211_ATTR_NAN_FUNC])
+ return -EINVAL;
+
+ if (wdev->owner_nlportid &&
+ wdev->owner_nlportid != info->snd_portid)
+ return -ENOTCONN;
+
+ err = nla_parse_nested(tb, NL80211_NAN_FUNC_ATTR_MAX,
+ info->attrs[NL80211_ATTR_NAN_FUNC],
+ nl80211_nan_func_policy);
+ if (err)
+ return err;
+
+ func = kzalloc(sizeof(*func), GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+
+ func->cookie = wdev->wiphy->cookie_counter++;
+
+ if (!tb[NL80211_NAN_FUNC_TYPE] ||
+ nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+
+ func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]);
+
+ if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]),
+ sizeof(func->service_id));
+
+ func->close_range =
+ nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]);
+
+ if (tb[NL80211_NAN_FUNC_SERVICE_INFO]) {
+ func->serv_spec_info_len =
+ nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]);
+ func->serv_spec_info =
+ kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]),
+ func->serv_spec_info_len,
+ GFP_KERNEL);
+ if (!func->serv_spec_info) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (tb[NL80211_NAN_FUNC_TTL])
+ func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]);
+
+ switch (func->type) {
+ case NL80211_NAN_FUNC_PUBLISH:
+ if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ func->publish_type =
+ nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]);
+ func->publish_bcast =
+ nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]);
+
+ if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) &&
+ func->publish_bcast) {
+ err = -EINVAL;
+ goto out;
+ }
+ break;
+ case NL80211_NAN_FUNC_SUBSCRIBE:
+ func->subscribe_active =
+ nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]);
+ break;
+ case NL80211_NAN_FUNC_FOLLOW_UP:
+ if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
+ !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ func->followup_id =
+ nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]);
+ func->followup_reqid =
+ nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]);
+ memcpy(func->followup_dest.addr,
+ nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]),
+ sizeof(func->followup_dest.addr));
+ if (func->ttl) {
+ err = -EINVAL;
+ goto out;
+ }
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tb[NL80211_NAN_FUNC_SRF]) {
+ struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];
+
+ err = nla_parse_nested(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
+ tb[NL80211_NAN_FUNC_SRF],
+ nl80211_nan_srf_policy);
+ if (err)
+ goto out;
+
+ func->srf_include =
+ nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]);
+
+ if (srf_tb[NL80211_NAN_SRF_BF]) {
+ if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] ||
+ !srf_tb[NL80211_NAN_SRF_BF_IDX]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ func->srf_bf_len =
+ nla_len(srf_tb[NL80211_NAN_SRF_BF]);
+ func->srf_bf =
+ kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]),
+ func->srf_bf_len, GFP_KERNEL);
+ if (!func->srf_bf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ func->srf_bf_idx =
+ nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]);
+ } else {
+ struct nlattr *attr, *mac_attr =
+ srf_tb[NL80211_NAN_SRF_MAC_ADDRS];
+ int n_entries, rem, i = 0;
+
+ if (!mac_attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ n_entries = validate_acl_mac_addrs(mac_attr);
+ if (n_entries <= 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ func->srf_num_macs = n_entries;
+ func->srf_macs =
+ kzalloc(sizeof(*func->srf_macs) * n_entries,
+ GFP_KERNEL);
+ if (!func->srf_macs) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ nla_for_each_nested(attr, mac_attr, rem)
+ memcpy(func->srf_macs[i++].addr, nla_data(attr),
+ sizeof(*func->srf_macs));
+ }
+ }
+
+ if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) {
+ err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER],
+ func, true);
+ if (err)
+ goto out;
+ }
+
+ if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) {
+ err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER],
+ func, false);
+ if (err)
+ goto out;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_ADD_NAN_FUNCTION);
+ /* This can't really happen - we just allocated 4KB */
+ if (WARN_ON(!hdr)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = rdev_add_nan_func(rdev, wdev, func);
+out:
+ if (err < 0) {
+ cfg80211_free_nan_func(func);
+ nlmsg_free(msg);
+ return err;
+ }
+
+ /* propagate the instance id and cookie to userspace */
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie,
+ NL80211_ATTR_PAD))
+ goto nla_put_failure;
+
+ func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
+ if (!func_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID,
+ func->instance_id))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, func_attr);
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+static int nl80211_nan_del_func(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ u64 cookie;
+
+ if (wdev->iftype != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (!wdev_running(wdev))
+ return -ENOTCONN;
+
+ if (!info->attrs[NL80211_ATTR_COOKIE])
+ return -EINVAL;
+
+ if (wdev->owner_nlportid &&
+ wdev->owner_nlportid != info->snd_portid)
+ return -ENOTCONN;
+
+ cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
+
+ rdev_del_nan_func(rdev, wdev, cookie);
+
+ return 0;
+}
+
+static int nl80211_nan_change_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_nan_conf conf = {};
+ u32 changed = 0;
+
+ if (wdev->iftype != NL80211_IFTYPE_NAN)
+ return -EOPNOTSUPP;
+
+ if (!wdev_running(wdev))
+ return -ENOTCONN;
+
+ if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
+ conf.master_pref =
+ nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
+ if (conf.master_pref <= 1 || conf.master_pref == 255)
+ return -EINVAL;
+
+ changed |= CFG80211_NAN_CONF_CHANGED_PREF;
+ }
+
+ if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
+ conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+ changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
+ }
+
+ if (!changed)
+ return -EINVAL;
+
+ return rdev_nan_change_conf(rdev, wdev, &conf, changed);
+}
+
+void cfg80211_nan_match(struct wireless_dev *wdev,
+ struct cfg80211_nan_match_params *match, gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct nlattr *match_attr, *local_func_attr, *peer_func_attr;
+ struct sk_buff *msg;
+ void *hdr;
+
+ if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie,
+ NL80211_ATTR_PAD) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr))
+ goto nla_put_failure;
+
+ match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH);
+ if (!match_attr)
+ goto nla_put_failure;
+
+ local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL);
+ if (!local_func_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, local_func_attr);
+
+ peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER);
+ if (!peer_func_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_NAN_FUNC_TYPE, match->type) ||
+ nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id))
+ goto nla_put_failure;
+
+ if (match->info && match->info_len &&
+ nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len,
+ match->info))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, peer_func_attr);
+ nla_nest_end(msg, match_attr);
+ genlmsg_end(msg, hdr);
+
+ if (!wdev->owner_nlportid)
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+ msg, 0, NL80211_MCGRP_NAN, gfp);
+ else
+ genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
+ wdev->owner_nlportid);
+
+ return;
+
+nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_nan_match);
+
+void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
+ u8 inst_id,
+ enum nl80211_nan_func_term_reason reason,
+ u64 cookie, gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ struct nlattr *func_attr;
+ void *hdr;
+
+ if (WARN_ON(!inst_id))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex)) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+ NL80211_ATTR_PAD))
+ goto nla_put_failure;
+
+ func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
+ if (!func_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id) ||
+ nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, func_attr);
+ genlmsg_end(msg, hdr);
+
+ if (!wdev->owner_nlportid)
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+ msg, 0, NL80211_MCGRP_NAN, gfp);
+ else
+ genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
+ wdev->owner_nlportid);
+
+ return;
+
+nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_nan_func_terminated);
+
static int nl80211_get_protocol_features(struct sk_buff *skb,
struct genl_info *info)
{
@@ -10513,10 +11340,7 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
- if (wdev->netdev &&
- !netif_running(wdev->netdev))
- return -ENETDOWN;
- if (!wdev->netdev && !wdev->p2p_started)
+ if (!wdev_running(wdev))
return -ENETDOWN;
}
@@ -10546,6 +11370,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
struct cfg80211_registered_device **rdev,
struct wireless_dev **wdev)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
u32 vid, subcmd;
unsigned int i;
int vcmd_idx = -1;
@@ -10581,31 +11406,28 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
}
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ attrbuf, nl80211_fam.maxattr, nl80211_policy);
if (err)
goto out_unlock;
- if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] ||
- !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
+ if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
+ !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
err = -EINVAL;
goto out_unlock;
}
- *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(*wdev))
*wdev = NULL;
- *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(*rdev)) {
err = PTR_ERR(*rdev);
goto out_unlock;
}
- vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]);
- subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
+ vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
+ subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) {
const struct wiphy_vendor_command *vcmd;
@@ -10629,9 +11451,9 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
goto out_unlock;
}
- if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) {
- data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
- data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
+ data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
}
/* 0 is the first index - add 1 to parse only once */
@@ -10679,10 +11501,7 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
return -EINVAL;
if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
- if (wdev->netdev &&
- !netif_running(wdev->netdev))
- return -ENETDOWN;
- if (!wdev->netdev && !wdev->p2p_started)
+ if (!wdev_running(wdev))
return -ENETDOWN;
}
}
@@ -10995,6 +11814,31 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
return 0;
}
+static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const struct nlattr *nla;
+ bool enabled;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!rdev->ops->set_multicast_to_unicast)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ nla = info->attrs[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED];
+ enabled = nla_get_flag(nla);
+
+ return rdev_set_multicast_to_unicast(rdev, dev, enabled);
+}
+
#define NL80211_FLAG_NEED_WIPHY 0x01
#define NL80211_FLAG_NEED_NETDEV 0x02
#define NL80211_FLAG_NEED_RTNL 0x04
@@ -11053,22 +11897,15 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
info->user_ptr[1] = wdev;
}
- if (dev) {
- if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
- !netif_running(dev)) {
- if (rtnl)
- rtnl_unlock();
- return -ENETDOWN;
- }
+ if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
+ !wdev_running(wdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+ if (dev)
dev_hold(dev);
- } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
- if (!wdev->p2p_started) {
- if (rtnl)
- rtnl_unlock();
- return -ENETDOWN;
- }
- }
info->user_ptr[0] = rdev;
}
@@ -11441,6 +12278,14 @@ static const struct genl_ops nl80211_ops[] = {
NL80211_FLAG_NEED_RTNL,
},
{
+ .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS,
+ .doit = nl80211_update_connect_params,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL80211_CMD_DISCONNECT,
.doit = nl80211_disconnect,
.policy = nl80211_policy,
@@ -11697,6 +12542,46 @@ static const struct genl_ops nl80211_ops[] = {
NL80211_FLAG_NEED_RTNL,
},
{
+ .cmd = NL80211_CMD_START_NAN,
+ .doit = nl80211_start_nan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_STOP_NAN,
+ .doit = nl80211_stop_nan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_ADD_NAN_FUNCTION,
+ .doit = nl80211_nan_add_func,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_NAN_FUNCTION,
+ .doit = nl80211_nan_del_func,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_CHANGE_NAN_CONFIG,
+ .doit = nl80211_nan_change_config,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL80211_CMD_SET_MCAST_RATE,
.doit = nl80211_set_mcast_rate,
.policy = nl80211_policy,
@@ -11821,6 +12706,29 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST,
+ .doit = nl80211_set_multicast_to_unicast,
+ .policy = nl80211_policy,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+};
+
+static struct genl_family nl80211_fam __ro_after_init = {
+ .name = NL80211_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL80211_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl80211_pre_doit,
+ .post_doit = nl80211_post_doit,
+ .module = THIS_MODULE,
+ .ops = nl80211_ops,
+ .n_ops = ARRAY_SIZE(nl80211_ops),
+ .mcgrps = nl80211_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
};
/* notification functions */
@@ -11847,6 +12755,29 @@ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
NL80211_MCGRP_CONFIG, GFP_KERNEL);
}
+void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_commands cmd)
+{
+ struct sk_buff *msg;
+
+ WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE &&
+ cmd != NL80211_CMD_DEL_INTERFACE);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev,
+ cmd == NL80211_CMD_DEL_INTERFACE) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_CONFIG, GFP_KERNEL);
+}
+
static int nl80211_add_scan_req(struct sk_buff *msg,
struct cfg80211_registered_device *rdev)
{
@@ -13587,13 +14518,17 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
bool schedule_destroy_work = false;
- bool schedule_scan_stop = false;
struct cfg80211_sched_scan_request *sched_scan_req =
rcu_dereference(rdev->sched_scan_req);
if (sched_scan_req && notify->portid &&
- sched_scan_req->owner_nlportid == notify->portid)
- schedule_scan_stop = true;
+ sched_scan_req->owner_nlportid == notify->portid) {
+ sched_scan_req->owner_nlportid = 0;
+
+ if (rdev->ops->sched_scan_stop &&
+ rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ schedule_work(&rdev->sched_scan_stop_wk);
+ }
list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) {
cfg80211_mlme_unregister_socket(wdev, notify->portid);
@@ -13624,12 +14559,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
spin_unlock(&rdev->destroy_list_lock);
schedule_work(&rdev->destroy_work);
}
- } else if (schedule_scan_stop) {
- sched_scan_req->owner_nlportid = 0;
-
- if (rdev->ops->sched_scan_stop &&
- rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
- schedule_work(&rdev->sched_scan_stop_wk);
}
}
@@ -13762,12 +14691,11 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
/* initialisation/exit functions */
-int nl80211_init(void)
+int __init nl80211_init(void)
{
int err;
- err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops,
- nl80211_mcgrps);
+ err = genl_register_family(&nl80211_fam);
if (err)
return err;
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a63f402b10b7..7e3821d7fcc5 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -7,6 +7,9 @@ int nl80211_init(void);
void nl80211_exit(void);
void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
enum nl80211_commands cmd);
+void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_commands cmd);
void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 85ff30bee2b9..2f425075ada8 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -490,6 +490,18 @@ static inline int rdev_connect(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_update_connect_params(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_connect_params *sme, u32 changed)
+{
+ int ret;
+ trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed);
+ ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
struct net_device *dev, u16 reason_code)
{
@@ -562,6 +574,18 @@ static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ const bool enabled)
+{
+ int ret;
+ trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
+ ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
trace_rdev_rfkill_poll(&rdev->wiphy);
@@ -887,6 +911,64 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
trace_rdev_return_void(&rdev->wiphy);
}
+static inline int rdev_start_nan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf)
+{
+ int ret;
+
+ trace_rdev_start_nan(&rdev->wiphy, wdev, conf);
+ ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ trace_rdev_stop_nan(&rdev->wiphy, wdev);
+ rdev->ops->stop_nan(&rdev->wiphy, wdev);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline int
+rdev_add_nan_func(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_func *nan_func)
+{
+ int ret;
+
+ trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func);
+ ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie);
+ rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline int
+rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf, u32 changes)
+{
+ int ret;
+
+ trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes);
+ if (rdev->ops->nan_change_conf)
+ ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf,
+ changes);
+ else
+ ret = -ENOTSUPP;
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
struct net_device *dev,
struct cfg80211_acl_data *params)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 0358e12be54b..35ad69fd0838 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -57,6 +57,19 @@
* also linked into the probe response struct.
*/
+/*
+ * Limit the number of BSS entries stored in mac80211. Each one is
+ * a bit over 4k at most, so this limits to roughly 4-5M of memory.
+ * If somebody wants to really attack this though, they'd likely
+ * use small beacons, and only one type of frame, limiting each of
+ * the entries to a much smaller size (in order to generate more
+ * entries in total, so overhead is bigger.)
+ */
+static int bss_entries_limit = 1000;
+module_param(bss_entries_limit, int, 0644);
+MODULE_PARM_DESC(bss_entries_limit,
+ "limit to number of scan BSS entries (per wiphy, default 1000)");
+
#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
static void bss_free(struct cfg80211_internal_bss *bss)
@@ -137,6 +150,10 @@ static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev,
list_del_init(&bss->list);
rb_erase(&bss->rbn, &rdev->bss_tree);
+ rdev->bss_entries--;
+ WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list),
+ "rdev bss entries[%d]/list[empty:%d] corruption\n",
+ rdev->bss_entries, list_empty(&rdev->bss_list));
bss_ref_put(rdev, bss);
return true;
}
@@ -163,6 +180,40 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev,
rdev->bss_generation++;
}
+static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_internal_bss *bss, *oldest = NULL;
+ bool ret;
+
+ lockdep_assert_held(&rdev->bss_lock);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (atomic_read(&bss->hold))
+ continue;
+
+ if (!list_empty(&bss->hidden_list) &&
+ !bss->pub.hidden_beacon_bss)
+ continue;
+
+ if (oldest && time_before(oldest->ts, bss->ts))
+ continue;
+ oldest = bss;
+ }
+
+ if (WARN_ON(!oldest))
+ return false;
+
+ /*
+ * The callers make sure to increase rdev->bss_generation if anything
+ * gets removed (and a new entry added), so there's no need to also do
+ * it here.
+ */
+
+ ret = __cfg80211_unlink_bss(rdev, oldest);
+ WARN_ON(!ret);
+ return ret;
+}
+
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
bool send_message)
{
@@ -352,52 +403,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev)
__cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE);
}
-const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
+const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len,
+ const u8 *match, int match_len,
+ int match_offset)
{
- while (len > 2 && ies[0] != eid) {
+ /* match_offset can't be smaller than 2, unless match_len is
+ * zero, in which case match_offset must be zero as well.
+ */
+ if (WARN_ON((match_len && match_offset < 2) ||
+ (!match_len && match_offset)))
+ return NULL;
+
+ while (len >= 2 && len >= ies[1] + 2) {
+ if ((ies[0] == eid) &&
+ (ies[1] + 2 >= match_offset + match_len) &&
+ !memcmp(ies + match_offset, match, match_len))
+ return ies;
+
len -= ies[1] + 2;
ies += ies[1] + 2;
}
- if (len < 2)
- return NULL;
- if (len < 2 + ies[1])
- return NULL;
- return ies;
+
+ return NULL;
}
-EXPORT_SYMBOL(cfg80211_find_ie);
+EXPORT_SYMBOL(cfg80211_find_ie_match);
const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
const u8 *ies, int len)
{
- struct ieee80211_vendor_ie *ie;
- const u8 *pos = ies, *end = ies + len;
- int ie_oui;
+ const u8 *ie;
+ u8 match[] = { oui >> 16, oui >> 8, oui, oui_type };
+ int match_len = (oui_type < 0) ? 3 : sizeof(match);
if (WARN_ON(oui_type > 0xff))
return NULL;
- while (pos < end) {
- pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos,
- end - pos);
- if (!pos)
- return NULL;
-
- ie = (struct ieee80211_vendor_ie *)pos;
+ ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len,
+ match, match_len, 2);
- /* make sure we can access ie->len */
- BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1);
-
- if (ie->len < sizeof(*ie))
- goto cont;
+ if (ie && (ie[1] < 4))
+ return NULL;
- ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2];
- if (ie_oui == oui &&
- (oui_type < 0 || ie->oui_type == oui_type))
- return pos;
-cont:
- pos += 2 + ie->len;
- }
- return NULL;
+ return ie;
}
EXPORT_SYMBOL(cfg80211_find_vendor_ie);
@@ -693,6 +740,7 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
const u8 *ie;
int i, ssidlen;
u8 fold = 0;
+ u32 n_entries = 0;
ies = rcu_access_pointer(new->pub.beacon_ies);
if (WARN_ON(!ies))
@@ -716,6 +764,12 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
/* This is the bad part ... */
list_for_each_entry(bss, &rdev->bss_list, list) {
+ /*
+ * we're iterating all the entries anyway, so take the
+ * opportunity to validate the list length accounting
+ */
+ n_entries++;
+
if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid))
continue;
if (bss->pub.channel != new->pub.channel)
@@ -744,6 +798,10 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
new->pub.beacon_ies);
}
+ WARN_ONCE(n_entries != rdev->bss_entries,
+ "rdev bss entries[%d]/list[len:%d] corruption\n",
+ rdev->bss_entries, n_entries);
+
return true;
}
@@ -898,7 +956,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
}
}
+ if (rdev->bss_entries >= bss_entries_limit &&
+ !cfg80211_bss_expire_oldest(rdev)) {
+ kfree(new);
+ goto drop;
+ }
+
list_add_tail(&new->list, &rdev->bss_list);
+ rdev->bss_entries++;
rb_insert_bss(rdev, new);
found = new;
}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index add6824c44fd..5e0d19380302 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -39,6 +39,7 @@ struct cfg80211_conn {
CFG80211_CONN_ASSOCIATING,
CFG80211_CONN_ASSOC_FAILED,
CFG80211_CONN_DEAUTH,
+ CFG80211_CONN_ABANDON,
CFG80211_CONN_CONNECTED,
} state;
u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
@@ -206,6 +207,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
NULL, 0,
WLAN_REASON_DEAUTH_LEAVING, false);
+ /* fall through */
+ case CFG80211_CONN_ABANDON:
/* free directly, disconnected event already sent */
cfg80211_sme_free(wdev);
return 0;
@@ -423,6 +426,17 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
schedule_work(&rdev->conn_work);
}
+void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return;
+
+ wdev->conn->state = CFG80211_CONN_ABANDON;
+ schedule_work(&rdev->conn_work);
+}
+
static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev,
const u8 *ies, size_t ies_len,
const u8 **out_ies, size_t *out_ies_len)
@@ -726,7 +740,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
wdev->current_bss = bss_from_pub(bss);
- cfg80211_upload_connect_keys(wdev);
+ if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
+ cfg80211_upload_connect_keys(wdev);
rcu_read_lock();
country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY);
@@ -1043,6 +1058,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
connect->crypto.ciphers_pairwise[0] = cipher;
}
}
+
+ connect->crypto.wep_keys = connkeys->params;
+ connect->crypto.wep_tx_key = connkeys->def;
+ } else {
+ if (WARN_ON(connkeys))
+ return -EINVAL;
}
wdev->connect_keys = connkeys;
@@ -1081,7 +1102,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
err = cfg80211_sme_disconnect(wdev, reason);
else if (!rdev->ops->disconnect)
cfg80211_mlme_down(rdev, dev);
- else if (wdev->current_bss)
+ else if (wdev->ssid_len)
err = rdev_disconnect(rdev, dev, reason);
return err;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index e46469bc130f..14b3f007826d 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -57,7 +57,7 @@ static ssize_t addresses_show(struct device *dev,
return sprintf(buf, "%pM\n", wiphy->perm_addr);
for (i = 0; i < wiphy->n_addresses; i++)
- buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr);
+ buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr);
return buf - start;
}
@@ -104,13 +104,16 @@ static int wiphy_suspend(struct device *dev)
rtnl_lock();
if (rdev->wiphy.registered) {
- if (!rdev->wiphy.wowlan_config)
+ if (!rdev->wiphy.wowlan_config) {
cfg80211_leave_all(rdev);
+ cfg80211_process_rdev_events(rdev);
+ }
if (rdev->ops->suspend)
ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config);
if (ret == 1) {
/* Driver refuse to configure wowlan */
cfg80211_leave_all(rdev);
+ cfg80211_process_rdev_events(rdev);
ret = rdev_suspend(rdev, NULL);
}
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 72b5255cefe2..ea1b47e04fa4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1281,6 +1281,24 @@ TRACE_EVENT(rdev_connect,
__entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid))
);
+TRACE_EVENT(rdev_update_connect_params,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_connect_params *sme, u32 changed),
+ TP_ARGS(wiphy, netdev, sme, changed),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u32, changed)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->changed = changed;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed)
+);
+
TRACE_EVENT(rdev_set_cqm_rssi_config,
TP_PROTO(struct wiphy *wiphy,
struct net_device *netdev, s32 rssi_thold,
@@ -1889,6 +1907,96 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
TP_ARGS(wiphy, wdev)
);
+TRACE_EVENT(rdev_start_nan,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf),
+ TP_ARGS(wiphy, wdev, conf),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u8, master_pref)
+ __field(u8, dual);
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->master_pref = conf->master_pref;
+ __entry->dual = conf->dual;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+ ", master preference: %u, dual: %d",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
+ __entry->dual)
+);
+
+TRACE_EVENT(rdev_nan_change_conf,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_nan_conf *conf, u32 changes),
+ TP_ARGS(wiphy, wdev, conf, changes),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u8, master_pref)
+ __field(u8, dual);
+ __field(u32, changes);
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->master_pref = conf->master_pref;
+ __entry->dual = conf->dual;
+ __entry->changes = changes;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+ ", master preference: %u, dual: %d, changes: %x",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
+ __entry->dual, __entry->changes)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_add_nan_func,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ const struct cfg80211_nan_func *func),
+ TP_ARGS(wiphy, wdev, func),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u8, func_type)
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->func_type = func->type;
+ __entry->cookie = func->cookie
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
+ __entry->cookie)
+);
+
+TRACE_EVENT(rdev_del_nan_func,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
+);
+
TRACE_EVENT(rdev_set_mac_acl,
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
struct cfg80211_acl_data *params),
@@ -2940,6 +3048,25 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
TP_ARGS(wiphy, wdev)
);
+
+TRACE_EVENT(rdev_set_multicast_to_unicast,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const bool enabled),
+ TP_ARGS(wiphy, netdev, enabled),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(bool, enabled)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ BOOL_TO_STR(__entry->enabled))
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index b7d1592bd5b8..e9d040d29846 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -13,6 +13,7 @@
#include <net/dsfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
+#include <linux/gcd.h>
#include "core.h"
#include "rdev-ops.h"
@@ -218,7 +219,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
bool pairwise, const u8 *mac_addr)
{
- if (key_idx > 5)
+ if (key_idx < 0 || key_idx > 5)
return -EINVAL;
if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
@@ -249,7 +250,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
/* Disallow BIP (group-only) cipher as pairwise cipher */
if (pairwise)
return -EINVAL;
+ if (key_idx < 4)
+ return -EINVAL;
break;
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ if (key_idx > 3)
+ return -EINVAL;
default:
break;
}
@@ -414,8 +421,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
}
EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);
-static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr,
- const u8 *addr, enum nl80211_iftype iftype)
+int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
+ const u8 *addr, enum nl80211_iftype iftype)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
struct {
@@ -519,13 +526,7 @@ static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr,
return 0;
}
-
-int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
- enum nl80211_iftype iftype)
-{
- return __ieee80211_data_to_8023(skb, NULL, addr, iftype);
-}
-EXPORT_SYMBOL(ieee80211_data_to_8023);
+EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr);
int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
enum nl80211_iftype iftype,
@@ -740,24 +741,18 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
const u8 *addr, enum nl80211_iftype iftype,
const unsigned int extra_headroom,
- bool has_80211_header)
+ const u8 *check_da, const u8 *check_sa)
{
unsigned int hlen = ALIGN(extra_headroom, 4);
struct sk_buff *frame = NULL;
u16 ethertype;
u8 *payload;
- int offset = 0, remaining, err;
+ int offset = 0, remaining;
struct ethhdr eth;
bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
bool reuse_skb = false;
bool last = false;
- if (has_80211_header) {
- err = __ieee80211_data_to_8023(skb, &eth, addr, iftype);
- if (err)
- goto out;
- }
-
while (!last) {
unsigned int subframe_len;
int len;
@@ -774,8 +769,17 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
goto purge;
offset += sizeof(struct ethhdr);
- /* reuse skb for the last subframe */
last = remaining <= subframe_len + padding;
+
+ /* FIXME: should we really accept multicast DA? */
+ if ((check_da && !is_multicast_ether_addr(eth.h_dest) &&
+ !ether_addr_equal(check_da, eth.h_dest)) ||
+ (check_sa && !ether_addr_equal(check_sa, eth.h_source))) {
+ offset += len + padding;
+ continue;
+ }
+
+ /* reuse skb for the last subframe */
if (!skb_is_nonlinear(skb) && !reuse_frag && last) {
skb_pull(skb, offset);
frame = skb;
@@ -813,7 +817,6 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
purge:
__skb_queue_purge(list);
- out:
dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_amsdu_to_8023s);
@@ -906,7 +909,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
if (!wdev->connect_keys)
return;
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) {
if (!wdev->connect_keys->params[i].cipher)
continue;
if (rdev_add_key(rdev, dev, i, false, NULL,
@@ -919,9 +922,6 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
netdev_err(dev, "failed to set defkey %d\n", i);
continue;
}
- if (wdev->connect_keys->defmgmt == i)
- if (rdev_set_default_mgmt_key(rdev, dev, i))
- netdev_err(dev, "failed to set mgtdef %d\n", i);
}
kzfree(wdev->connect_keys);
@@ -1005,8 +1005,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
if (otype == NL80211_IFTYPE_AP_VLAN)
return -EOPNOTSUPP;
- /* cannot change into P2P device type */
- if (ntype == NL80211_IFTYPE_P2P_DEVICE)
+ /* cannot change into P2P device or NAN */
+ if (ntype == NL80211_IFTYPE_P2P_DEVICE ||
+ ntype == NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
if (!rdev->ops->change_virtual_intf ||
@@ -1085,6 +1086,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
/* not happening */
break;
case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
WARN_ON(1);
break;
}
@@ -1157,7 +1159,8 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
58500000,
65000000,
78000000,
- 0,
+ /* not in the spec, but some devices use this: */
+ 86500000,
},
{ 13500000,
27000000,
@@ -1376,6 +1379,25 @@ static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id)
return false;
}
+static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos)
+{
+ /* we assume a validly formed IEs buffer */
+ u8 len = ies[pos + 1];
+
+ pos += 2 + len;
+
+ /* the IE itself must have 255 bytes for fragments to follow */
+ if (len < 255)
+ return pos;
+
+ while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) {
+ len = ies[pos + 1];
+ pos += 2 + len;
+ }
+
+ return pos;
+}
+
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
const u8 *ids, int n_ids,
const u8 *after_ric, int n_after_ric,
@@ -1385,14 +1407,14 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) {
if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) {
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
while (pos < ielen &&
!ieee80211_id_in_list(after_ric, n_after_ric,
ies[pos]))
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
} else {
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
}
}
@@ -1553,31 +1575,57 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
}
EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);
-int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
- u32 beacon_int)
+static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
+ u32 *beacon_int_gcd,
+ bool *beacon_int_different)
{
struct wireless_dev *wdev;
- int res = 0;
- if (!beacon_int)
- return -EINVAL;
+ *beacon_int_gcd = 0;
+ *beacon_int_different = false;
- list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ list_for_each_entry(wdev, &wiphy->wdev_list, list) {
if (!wdev->beacon_interval)
continue;
- if (wdev->beacon_interval != beacon_int) {
- res = -EINVAL;
- break;
+
+ if (!*beacon_int_gcd) {
+ *beacon_int_gcd = wdev->beacon_interval;
+ continue;
}
+
+ if (wdev->beacon_interval == *beacon_int_gcd)
+ continue;
+
+ *beacon_int_different = true;
+ *beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval);
}
- return res;
+ if (new_beacon_int && *beacon_int_gcd != new_beacon_int) {
+ if (*beacon_int_gcd)
+ *beacon_int_different = true;
+ *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int);
+ }
+}
+
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+ enum nl80211_iftype iftype, u32 beacon_int)
+{
+ /*
+ * This is just a basic pre-condition check; if interface combinations
+ * are possible the driver must already be checking those with a call
+ * to cfg80211_check_combinations(), in which case we'll validate more
+ * through the cfg80211_calculate_bi_data() call and code in
+ * cfg80211_iter_combinations().
+ */
+
+ if (beacon_int < 10 || beacon_int > 10000)
+ return -EINVAL;
+
+ return 0;
}
int cfg80211_iter_combinations(struct wiphy *wiphy,
- const int num_different_channels,
- const u8 radar_detect,
- const int iftype_num[NUM_NL80211_IFTYPES],
+ struct iface_combination_params *params,
void (*iter)(const struct ieee80211_iface_combination *c,
void *data),
void *data)
@@ -1587,8 +1635,23 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
int i, j, iftype;
int num_interfaces = 0;
u32 used_iftypes = 0;
+ u32 beacon_int_gcd;
+ bool beacon_int_different;
- if (radar_detect) {
+ /*
+ * This is a bit strange, since the iteration used to rely only on
+ * the data given by the driver, but here it now relies on context,
+ * in form of the currently operating interfaces.
+ * This is OK for all current users, and saves us from having to
+ * push the GCD calculations into all the drivers.
+ * In the future, this should probably rely more on data that's in
+ * cfg80211 already - the only thing not would appear to be any new
+ * interfaces (while being brought up) and channel/radar data.
+ */
+ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int,
+ &beacon_int_gcd, &beacon_int_different);
+
+ if (params->radar_detect) {
rcu_read_lock();
regdom = rcu_dereference(cfg80211_regdomain);
if (regdom)
@@ -1597,8 +1660,8 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
}
for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
- num_interfaces += iftype_num[iftype];
- if (iftype_num[iftype] > 0 &&
+ num_interfaces += params->iftype_num[iftype];
+ if (params->iftype_num[iftype] > 0 &&
!(wiphy->software_iftypes & BIT(iftype)))
used_iftypes |= BIT(iftype);
}
@@ -1612,7 +1675,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
if (num_interfaces > c->max_interfaces)
continue;
- if (num_different_channels > c->num_different_channels)
+ if (params->num_different_channels > c->num_different_channels)
continue;
limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
@@ -1627,16 +1690,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
all_iftypes |= limits[j].types;
if (!(limits[j].types & BIT(iftype)))
continue;
- if (limits[j].max < iftype_num[iftype])
+ if (limits[j].max < params->iftype_num[iftype])
goto cont;
- limits[j].max -= iftype_num[iftype];
+ limits[j].max -= params->iftype_num[iftype];
}
}
- if (radar_detect != (c->radar_detect_widths & radar_detect))
+ if (params->radar_detect !=
+ (c->radar_detect_widths & params->radar_detect))
goto cont;
- if (radar_detect && c->radar_detect_regions &&
+ if (params->radar_detect && c->radar_detect_regions &&
!(c->radar_detect_regions & BIT(region)))
goto cont;
@@ -1648,6 +1712,14 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
if ((all_iftypes & used_iftypes) != used_iftypes)
goto cont;
+ if (beacon_int_gcd) {
+ if (c->beacon_int_min_gcd &&
+ beacon_int_gcd < c->beacon_int_min_gcd)
+ goto cont;
+ if (!c->beacon_int_min_gcd && beacon_int_different)
+ goto cont;
+ }
+
/* This combination covered all interface types and
* supported the requested numbers, so we're good.
*/
@@ -1670,14 +1742,11 @@ cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c,
}
int cfg80211_check_combinations(struct wiphy *wiphy,
- const int num_different_channels,
- const u8 radar_detect,
- const int iftype_num[NUM_NL80211_IFTYPES])
+ struct iface_combination_params *params)
{
int err, num = 0;
- err = cfg80211_iter_combinations(wiphy, num_different_channels,
- radar_detect, iftype_num,
+ err = cfg80211_iter_combinations(wiphy, params,
cfg80211_iter_sum_ifcombs, &num);
if (err)
return err;
@@ -1757,6 +1826,28 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
}
EXPORT_SYMBOL(cfg80211_get_station);
+void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
+{
+ int i;
+
+ if (!f)
+ return;
+
+ kfree(f->serv_spec_info);
+ kfree(f->srf_bf);
+ kfree(f->srf_macs);
+ for (i = 0; i < f->num_rx_filters; i++)
+ kfree(f->rx_filters[i].filter);
+
+ for (i = 0; i < f->num_tx_filters; i++)
+ kfree(f->tx_filters[i].filter);
+
+ kfree(f->rx_filters);
+ kfree(f->tx_filters);
+ kfree(f);
+}
+EXPORT_SYMBOL(cfg80211_free_nan_func);
+
/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
const unsigned char rfc1042_header[] __aligned(2) =
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 9f27221c8913..a220156cf217 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -406,12 +406,16 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
if (pairwise && !addr)
return -EINVAL;
+ /*
+ * In many cases we won't actually need this, but it's better
+ * to do it first in case the allocation fails. Don't use wext.
+ */
if (!wdev->wext.keys) {
wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!wdev->wext.keys)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
wdev->wext.keys->params[i].key =
wdev->wext.keys->data[i];
}
@@ -460,7 +464,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
if (err == -ENOENT)
err = 0;
if (!err) {
- if (!addr) {
+ if (!addr && idx < 4) {
memset(wdev->wext.keys->data[idx], 0,
sizeof(wdev->wext.keys->data[idx]));
wdev->wext.keys->params[idx].key_len = 0;
@@ -487,10 +491,19 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
err = 0;
if (wdev->current_bss)
err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
+ else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ params->cipher != WLAN_CIPHER_SUITE_WEP104)
+ return -EINVAL;
if (err)
return err;
- if (!addr) {
+ /*
+ * We only need to store WEP keys, since they're the only keys that
+ * can be be set before a connection is established and persist after
+ * disconnecting.
+ */
+ if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
+ params->cipher == WLAN_CIPHER_SUITE_WEP104)) {
wdev->wext.keys->params[idx] = *params;
memcpy(wdev->wext.keys->data[idx],
params->key, params->key_len);
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index a4e8af3321d2..995163830a61 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -35,7 +35,6 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
if (wdev->wext.keys) {
wdev->wext.keys->def = wdev->wext.default_key;
- wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
if (wdev->wext.default_key != -1)
wdev->wext.connect.privacy = true;
}
@@ -43,11 +42,11 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
if (!wdev->wext.connect.ssid_len)
return 0;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys && wdev->wext.keys->def != -1) {
ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
if (!ck)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
ck->params[i].key = ck->data[i];
}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a750f330b8dd..079c883aa96e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -51,7 +51,7 @@
#include <linux/slab.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/notifier.h>
@@ -1500,12 +1500,8 @@ out_fac_release:
goto out_dtefac_release;
if (dtefacs.calling_len > X25_MAX_AE_LEN)
goto out_dtefac_release;
- if (dtefacs.calling_ae == NULL)
- goto out_dtefac_release;
if (dtefacs.called_len > X25_MAX_AE_LEN)
goto out_dtefac_release;
- if (dtefacs.called_ae == NULL)
- goto out_dtefac_release;
x25->dte_facilities = dtefacs;
rc = 0;
out_dtefac_release:
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index 43239527a205..a06dfe143c67 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -70,7 +70,7 @@ static struct ctl_table x25_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { 0, },
+ { },
};
void __init x25_register_sysctl(void)
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index fd5ffb25873f..bcaa180d6a3f 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -29,7 +29,7 @@
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <net/x25.h>
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 250e567ba3d6..44ac85fe2bc9 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -17,7 +17,7 @@
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <net/xfrm.h>
-#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP)
#include <net/esp.h>
#endif
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 45f9cf97ea25..177e208e8ff5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
__read_mostly;
static struct kmem_cache *xfrm_dst_cache __read_mostly;
+static __read_mostly seqcount_t xfrm_policy_hash_generation;
static void xfrm_init_pmtu(struct dst_entry *dst);
static int stale_bundle(struct dst_entry *dst);
@@ -59,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
+static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
+{
+ return atomic_inc_not_zero(&policy->refcnt);
+}
+
static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
@@ -385,9 +391,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
__get_hash_thresh(net, family, dir, &dbits, &sbits);
hash = __sel_hash(sel, family, hmask, dbits, sbits);
- return (hash == hmask + 1 ?
- &net->xfrm.policy_inexact[dir] :
- net->xfrm.policy_bydst[dir].table + hash);
+ if (hash == hmask + 1)
+ return &net->xfrm.policy_inexact[dir];
+
+ return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}
static struct hlist_head *policy_hash_direct(struct net *net,
@@ -403,7 +411,8 @@ static struct hlist_head *policy_hash_direct(struct net *net,
__get_hash_thresh(net, family, dir, &dbits, &sbits);
hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
- return net->xfrm.policy_bydst[dir].table + hash;
+ return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}
static void xfrm_dst_hash_transfer(struct net *net,
@@ -426,14 +435,14 @@ redo:
h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
pol->family, nhashmask, dbits, sbits);
if (!entry0) {
- hlist_del(&pol->bydst);
- hlist_add_head(&pol->bydst, ndsttable+h);
+ hlist_del_rcu(&pol->bydst);
+ hlist_add_head_rcu(&pol->bydst, ndsttable + h);
h0 = h;
} else {
if (h != h0)
continue;
- hlist_del(&pol->bydst);
- hlist_add_behind(&pol->bydst, entry0);
+ hlist_del_rcu(&pol->bydst);
+ hlist_add_behind_rcu(&pol->bydst, entry0);
}
entry0 = &pol->bydst;
}
@@ -468,22 +477,32 @@ static void xfrm_bydst_resize(struct net *net, int dir)
unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
unsigned int nhashmask = xfrm_new_hash_mask(hmask);
unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
- struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
struct hlist_head *ndst = xfrm_hash_alloc(nsize);
+ struct hlist_head *odst;
int i;
if (!ndst)
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ write_seqcount_begin(&xfrm_policy_hash_generation);
+
+ odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
+
+ odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
for (i = hmask; i >= 0; i--)
xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
- net->xfrm.policy_bydst[dir].table = ndst;
+ rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
net->xfrm.policy_bydst[dir].hmask = nhashmask;
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ write_seqcount_end(&xfrm_policy_hash_generation);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ synchronize_rcu();
xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -500,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
if (!nidx)
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
for (i = hmask; i >= 0; i--)
xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
@@ -508,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
net->xfrm.policy_byidx = nidx;
net->xfrm.policy_idx_hmask = nhashmask;
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -541,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total)
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
@@ -550,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
si->spdhcnt = net->xfrm.policy_idx_hmask;
si->spdhmcnt = xfrm_policy_hashmax;
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_spd_getinfo);
@@ -600,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
rbits6 = net->xfrm.policy_hthresh.rbits6;
} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
/* reset the bydst and inexact table in all directions */
for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
@@ -646,7 +663,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
hlist_add_head(&policy->bydst, chain);
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
}
@@ -757,7 +774,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
struct hlist_head *chain;
struct hlist_node *newpos;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
delpol = NULL;
newpos = NULL;
@@ -768,7 +785,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
if (excl) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return -EEXIST;
}
delpol = pol;
@@ -804,7 +821,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
policy->curlft.use_time = 0;
if (!mod_timer(&policy->timer, jiffies + HZ))
xfrm_pol_hold(policy);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (delpol)
xfrm_policy_kill(delpol);
@@ -824,7 +841,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
struct hlist_head *chain;
*err = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
@@ -837,7 +854,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -846,7 +863,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
break;
}
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -865,7 +882,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
return NULL;
*err = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = net->xfrm.policy_byidx + idx_hash(net, id);
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
@@ -876,7 +893,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -885,7 +902,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
break;
}
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -943,7 +960,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
int dir, err = 0, cnt = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
err = xfrm_policy_flush_secctx_check(net, type, task_valid);
if (err)
@@ -959,14 +976,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1, task_valid);
xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again1;
}
@@ -978,13 +995,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1, task_valid);
xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again2;
}
}
@@ -993,7 +1010,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (!cnt)
err = -ESRCH;
out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
@@ -1013,7 +1030,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
if (list_empty(&walk->walk.all) && walk->seq != 0)
return 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
if (list_empty(&walk->walk.all))
x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
else
@@ -1041,7 +1058,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
}
list_del_init(&walk->walk.all);
out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);
@@ -1060,9 +1077,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
if (list_empty(&walk->walk.all))
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
list_del(&walk->walk.all);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_walk_done);
@@ -1100,17 +1117,24 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
struct xfrm_policy *pol, *ret;
const xfrm_address_t *daddr, *saddr;
struct hlist_head *chain;
- u32 priority = ~0U;
+ unsigned int sequence;
+ u32 priority;
daddr = xfrm_flowi_daddr(fl, family);
saddr = xfrm_flowi_saddr(fl, family);
if (unlikely(!daddr || !saddr))
return NULL;
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
- chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ rcu_read_lock();
+ retry:
+ do {
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+ priority = ~0U;
ret = NULL;
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
err = xfrm_policy_match(pol, fl, type, family, dir);
if (err) {
if (err == -ESRCH)
@@ -1126,7 +1150,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
}
chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
if ((pol->priority >= priority) && ret)
break;
@@ -1144,9 +1168,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
}
- xfrm_pol_hold(ret);
+ if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+ goto retry;
+
+ if (ret && !xfrm_pol_hold_rcu(ret))
+ goto retry;
fail:
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1223,10 +1251,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
const struct flowi *fl)
{
struct xfrm_policy *pol;
- struct net *net = sock_net(sk);
rcu_read_lock();
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
+ again:
pol = rcu_dereference(sk->sk_policy[dir]);
if (pol != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
@@ -1241,17 +1268,18 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
err = security_xfrm_policy_lookup(pol->security,
fl->flowi_secid,
policy_to_flow_dir(dir));
- if (!err)
- xfrm_pol_hold(pol);
- else if (err == -ESRCH)
+ if (!err) {
+ if (!xfrm_pol_hold_rcu(pol))
+ goto again;
+ } else if (err == -ESRCH) {
pol = NULL;
- else
+ } else {
pol = ERR_PTR(err);
+ }
} else
pol = NULL;
}
out:
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
rcu_read_unlock();
return pol;
}
@@ -1275,7 +1303,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
/* Socket policies are not hashed. */
if (!hlist_unhashed(&pol->bydst)) {
- hlist_del(&pol->bydst);
+ hlist_del_rcu(&pol->bydst);
hlist_del(&pol->byidx);
}
@@ -1299,9 +1327,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
struct net *net = xp_net(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
pol = __xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (pol) {
xfrm_policy_kill(pol);
return 0;
@@ -1320,7 +1348,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
return -EINVAL;
#endif
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
old_pol = rcu_dereference_protected(sk->sk_policy[dir],
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
@@ -1338,7 +1366,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
*/
xfrm_sk_policy_unlink(old_pol, dir);
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (old_pol) {
xfrm_policy_kill(old_pol);
@@ -1368,9 +1396,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
newp->type = old->type;
memcpy(newp->xfrm_vec, old->xfrm_vec,
newp->xfrm_nr*sizeof(struct xfrm_tmpl));
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_sk_policy_link(newp, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_pol_put(newp);
}
return newp;
@@ -3052,7 +3080,7 @@ static int __net_init xfrm_net_init(struct net *net)
/* Initialize the per-net locks here */
spin_lock_init(&net->xfrm.xfrm_state_lock);
- rwlock_init(&net->xfrm.xfrm_policy_lock);
+ spin_lock_init(&net->xfrm.xfrm_policy_lock);
mutex_init(&net->xfrm.xfrm_cfg_mutex);
return 0;
@@ -3085,7 +3113,9 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
+ flow_cache_hp_init();
register_pernet_subsys(&xfrm_net_ops);
+ seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
}
@@ -3183,7 +3213,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
struct hlist_head *chain;
u32 priority = ~0U;
- read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
hlist_for_each_entry(pol, chain, bydst) {
if (xfrm_migrate_selector_match(sel, &pol->selector) &&
@@ -3207,7 +3237,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
xfrm_pol_hold(ret);
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return ret;
}
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 9c4fbd8935f4..ba2b539879bc 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -50,12 +50,18 @@ static const struct snmp_mib xfrm_mib_list[] = {
static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
{
+ unsigned long buff[LINUX_MIB_XFRMMAX];
struct net *net = seq->private;
int i;
+
+ memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
+
+ snmp_get_cpu_field_batch(buff, xfrm_mib_list,
+ net->mib.xfrm_statistics);
for (i = 0; xfrm_mib_list[i].name; i++)
seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
- snmp_fold_field(net->mib.xfrm_statistics,
- xfrm_mib_list[i].entry));
+ buff[i]);
+
return 0;
}
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 4fd725a0c500..cdc2e2e71bff 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
x->repl->notify(x, XFRM_REPLAY_UPDATE);
}
-static struct xfrm_replay xfrm_replay_legacy = {
+static const struct xfrm_replay xfrm_replay_legacy = {
.advance = xfrm_replay_advance,
.check = xfrm_replay_check,
.recheck = xfrm_replay_check,
@@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = {
.overflow = xfrm_replay_overflow,
};
-static struct xfrm_replay xfrm_replay_bmp = {
+static const struct xfrm_replay xfrm_replay_bmp = {
.advance = xfrm_replay_advance_bmp,
.check = xfrm_replay_check_bmp,
.recheck = xfrm_replay_check_bmp,
@@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = {
.overflow = xfrm_replay_overflow_bmp,
};
-static struct xfrm_replay xfrm_replay_esn = {
+static const struct xfrm_replay xfrm_replay_esn = {
.advance = xfrm_replay_advance_esn,
.check = xfrm_replay_check_esn,
.recheck = xfrm_replay_recheck_esn,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a30f898dc1c5..64e3c82eedf6 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -20,7 +20,7 @@
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/audit.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
@@ -28,6 +28,11 @@
#include "xfrm_hash.h"
+#define xfrm_state_deref_prot(table, net) \
+ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))
+
+static void xfrm_state_gc_task(struct work_struct *work);
+
/* Each xfrm_state may be linked to two tables:
1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
@@ -36,6 +41,15 @@
*/
static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+
+static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
+static HLIST_HEAD(xfrm_state_gc_list);
+
+static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
+{
+ return atomic_inc_not_zero(&x->refcnt);
+}
static inline unsigned int xfrm_dst_hash(struct net *net,
const xfrm_address_t *daddr,
@@ -76,18 +90,18 @@ static void xfrm_hash_transfer(struct hlist_head *list,
h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
x->props.reqid, x->props.family,
nhashmask);
- hlist_add_head(&x->bydst, ndsttable+h);
+ hlist_add_head_rcu(&x->bydst, ndsttable + h);
h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
x->props.family,
nhashmask);
- hlist_add_head(&x->bysrc, nsrctable+h);
+ hlist_add_head_rcu(&x->bysrc, nsrctable + h);
if (x->id.spi) {
h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
x->id.proto, x->props.family,
nhashmask);
- hlist_add_head(&x->byspi, nspitable+h);
+ hlist_add_head_rcu(&x->byspi, nspitable + h);
}
}
}
@@ -122,25 +136,29 @@ static void xfrm_hash_resize(struct work_struct *work)
}
spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ write_seqcount_begin(&xfrm_state_hash_generation);
nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+ odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net);
for (i = net->xfrm.state_hmask; i >= 0; i--)
- xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
- nhashmask);
+ xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask);
- odst = net->xfrm.state_bydst;
- osrc = net->xfrm.state_bysrc;
- ospi = net->xfrm.state_byspi;
+ osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net);
+ ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net);
ohashmask = net->xfrm.state_hmask;
- net->xfrm.state_bydst = ndst;
- net->xfrm.state_bysrc = nsrc;
- net->xfrm.state_byspi = nspi;
+ rcu_assign_pointer(net->xfrm.state_bydst, ndst);
+ rcu_assign_pointer(net->xfrm.state_bysrc, nsrc);
+ rcu_assign_pointer(net->xfrm.state_byspi, nspi);
net->xfrm.state_hmask = nhashmask;
+ write_seqcount_end(&xfrm_state_hash_generation);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
osize = (ohashmask + 1) * sizeof(struct hlist_head);
+
+ synchronize_rcu();
+
xfrm_hash_free(odst, osize);
xfrm_hash_free(osrc, osize);
xfrm_hash_free(ospi, osize);
@@ -356,27 +374,20 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
static void xfrm_state_gc_task(struct work_struct *work)
{
- struct net *net = container_of(work, struct net, xfrm.state_gc_work);
struct xfrm_state *x;
struct hlist_node *tmp;
struct hlist_head gc_list;
spin_lock_bh(&xfrm_state_gc_lock);
- hlist_move_list(&net->xfrm.state_gc_list, &gc_list);
+ hlist_move_list(&xfrm_state_gc_list, &gc_list);
spin_unlock_bh(&xfrm_state_gc_lock);
+ synchronize_rcu();
+
hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
xfrm_state_gc_destroy(x);
}
-static inline unsigned long make_jiffies(long secs)
-{
- if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
- return MAX_SCHEDULE_TIMEOUT-1;
- else
- return secs*HZ;
-}
-
static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
{
struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
@@ -501,14 +512,12 @@ EXPORT_SYMBOL(xfrm_state_alloc);
void __xfrm_state_destroy(struct xfrm_state *x)
{
- struct net *net = xs_net(x);
-
WARN_ON(x->km.state != XFRM_STATE_DEAD);
spin_lock_bh(&xfrm_state_gc_lock);
- hlist_add_head(&x->gclist, &net->xfrm.state_gc_list);
+ hlist_add_head(&x->gclist, &xfrm_state_gc_list);
spin_unlock_bh(&xfrm_state_gc_lock);
- schedule_work(&net->xfrm.state_gc_work);
+ schedule_work(&xfrm_state_gc_work);
}
EXPORT_SYMBOL(__xfrm_state_destroy);
@@ -521,10 +530,10 @@ int __xfrm_state_delete(struct xfrm_state *x)
x->km.state = XFRM_STATE_DEAD;
spin_lock(&net->xfrm.xfrm_state_lock);
list_del(&x->km.all);
- hlist_del(&x->bydst);
- hlist_del(&x->bysrc);
+ hlist_del_rcu(&x->bydst);
+ hlist_del_rcu(&x->bysrc);
if (x->id.spi)
- hlist_del(&x->byspi);
+ hlist_del_rcu(&x->byspi);
net->xfrm.state_num--;
spin_unlock(&net->xfrm.xfrm_state_lock);
@@ -660,7 +669,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
struct xfrm_state *x;
- hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) {
if (x->props.family != family ||
x->id.spi != spi ||
x->id.proto != proto ||
@@ -669,7 +678,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
if ((mark & x->mark.m) != x->mark.v)
continue;
- xfrm_state_hold(x);
+ if (!xfrm_state_hold_rcu(x))
+ continue;
return x;
}
@@ -684,7 +694,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
struct xfrm_state *x;
- hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) {
if (x->props.family != family ||
x->id.proto != proto ||
!xfrm_addr_equal(&x->id.daddr, daddr, family) ||
@@ -693,7 +703,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
if ((mark & x->mark.m) != x->mark.v)
continue;
- xfrm_state_hold(x);
+ if (!xfrm_state_hold_rcu(x))
+ continue;
return x;
}
@@ -776,13 +787,16 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
struct xfrm_state *best = NULL;
u32 mark = pol->mark.v & pol->mark.m;
unsigned short encap_family = tmpl->encap_family;
+ unsigned int sequence;
struct km_event c;
to_put = NULL;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ sequence = read_seqcount_begin(&xfrm_state_hash_generation);
+
+ rcu_read_lock();
h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
- hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) {
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
@@ -798,7 +812,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
goto found;
h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
- hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) {
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
@@ -851,19 +865,21 @@ found:
}
if (km_query(x, tmpl, pol) == 0) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
x->km.state = XFRM_STATE_ACQ;
list_add(&x->km.all, &net->xfrm.state_all);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, daddr, saddr, encap_family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
if (x->id.spi) {
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
}
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
net->xfrm.state_num++;
xfrm_hash_grow_check(net, x->bydst.next != NULL);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
} else {
x->km.state = XFRM_STATE_DEAD;
to_put = x;
@@ -872,13 +888,26 @@ found:
}
}
out:
- if (x)
- xfrm_state_hold(x);
- else
+ if (x) {
+ if (!xfrm_state_hold_rcu(x)) {
+ *err = -EAGAIN;
+ x = NULL;
+ }
+ } else {
*err = acquire_in_progress ? -EAGAIN : error;
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ }
+ rcu_read_unlock();
if (to_put)
xfrm_state_put(to_put);
+
+ if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) {
+ *err = -EAGAIN;
+ if (x) {
+ xfrm_state_put(x);
+ x = NULL;
+ }
+ }
+
return x;
}
@@ -946,16 +975,16 @@ static void __xfrm_state_insert(struct xfrm_state *x)
h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
x->props.reqid, x->props.family);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
if (x->id.spi) {
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
x->props.family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
}
tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
@@ -1064,9 +1093,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
xfrm_state_hold(x);
tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
list_add(&x->km.all, &net->xfrm.state_all);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, daddr, saddr, family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
net->xfrm.state_num++;
@@ -1375,7 +1404,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {
x->km.state = XFRM_STATE_EXPIRED;
- tasklet_hrtimer_start(&x->mtimer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
return -EINVAL;
}
@@ -1395,9 +1424,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32
{
struct xfrm_state *x;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_lock();
x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_unlock();
return x;
}
EXPORT_SYMBOL(xfrm_state_lookup);
@@ -1582,7 +1611,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
if (x->id.spi) {
spin_lock_bh(&net->xfrm.xfrm_state_lock);
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
err = 0;
@@ -2100,8 +2129,6 @@ int __net_init xfrm_state_init(struct net *net)
net->xfrm.state_num = 0;
INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
- INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
- INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
spin_lock_init(&net->xfrm.xfrm_state_lock);
return 0;
@@ -2119,7 +2146,7 @@ void xfrm_state_fini(struct net *net)
flush_work(&net->xfrm.state_hash_work);
xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
- flush_work(&net->xfrm.state_gc_work);
+ flush_work(&xfrm_state_gc_work);
WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 05a6e3d9c258..35a7e794ad04 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -17,13 +17,13 @@ static struct ctl_table xfrm_table[] = {
.procname = "xfrm_aevent_etime",
.maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_douintvec
},
{
.procname = "xfrm_aevent_rseqth",
.maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_douintvec
},
{
.procname = "xfrm_larval_drop",
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 08892091cfe3..9705c279494b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -27,7 +27,7 @@
#include <net/xfrm.h>
#include <net/netlink.h>
#include <net/ah.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif
@@ -2450,7 +2450,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
#endif
type = nlh->nlmsg_type;