diff options
Diffstat (limited to 'net')
153 files changed, 1871 insertions, 1274 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 175273f38cb1..44ebd5c2cd4a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -169,6 +169,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; + vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6f142f03716d..019efb79708f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -524,6 +524,11 @@ static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); } +static int vlan_dev_get_lock_subclass(struct net_device *dev) +{ + return vlan_dev_priv(dev)->nest_level; +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .rebuild = vlan_dev_rebuild_header, @@ -559,7 +564,6 @@ static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - int subclass = 0; netif_carrier_off(dev); @@ -608,10 +612,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - if (is_vlan_dev(real_dev)) - subclass = 1; - - vlan_dev_set_lockdep_class(dev, subclass); + vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan_dev_priv(dev)->vlan_pcpu_stats) @@ -783,6 +784,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; void vlan_setup(struct net_device *dev) diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 8c93267ce969..c4e09846d1de 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -252,7 +252,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) * we need to ensure there's a memory barrier after it. The bit * *must* be set before we do the atomic_inc() on pvcc->inflight. * There's no smp_mb__after_set_bit(), so it's this or abuse - * smp_mb__after_clear_bit(). + * smp_mb__after_atomic(). */ test_and_set_bit(BLOCKED, &pvcc->blocked); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index b3bd4ec3fd94..f04224c32005 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1545,6 +1545,8 @@ out_neigh: if ((orig_neigh_node) && (!is_single_hop_neigh)) batadv_orig_node_free_ref(orig_neigh_node); out: + if (router_ifinfo) + batadv_neigh_ifinfo_free_ref(router_ifinfo); if (router) batadv_neigh_node_free_ref(router); if (router_router) diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b25fd64d727b..aa5d4946d0d7 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -940,8 +940,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, * additional DAT answer may trigger kernel warnings about * a packet coming from the wrong port. */ - if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, - BATADV_NO_FLAGS)) { + if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, vid)) { ret = true; goto out; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index bcc4bea632fa..f14e54a05691 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -418,12 +418,13 @@ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) { struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if; + struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; unsigned mtu = neigh_node->if_incoming->net_dev->mtu; unsigned header_size = sizeof(frag_header); unsigned max_fragment_size, max_packet_size; + bool ret = false; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE @@ -483,7 +484,11 @@ bool batadv_frag_send_packet(struct sk_buff *skb, skb->len + ETH_HLEN); batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); - return true; + ret = true; + out_err: - return false; + if (primary_if) + batadv_hardif_free_ref(primary_if); + + return ret; } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index c835e137423b..90cff585b37d 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -42,8 +42,10 @@ static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) { - if (atomic_dec_and_test(&gw_node->refcount)) + if (atomic_dec_and_test(&gw_node->refcount)) { + batadv_orig_node_free_ref(gw_node->orig_node); kfree_rcu(gw_node, rcu); + } } static struct batadv_gw_node * @@ -406,9 +408,14 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; + if (!atomic_inc_not_zero(&orig_node->refcount)) + return; + gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); - if (!gw_node) + if (!gw_node) { + batadv_orig_node_free_ref(orig_node); return; + } INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index b851cc580853..fbda6b54baff 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -83,7 +83,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; /* no more parents..stop recursion */ - if (net_dev->iflink == net_dev->ifindex) + if (net_dev->iflink == 0 || net_dev->iflink == net_dev->ifindex) return false; /* recurse over the parent device */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8c7ca811de6e..96b66fd30f96 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -415,7 +415,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -442,7 +442,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -493,7 +493,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index ffd9dfbd9b0e..6a484514cd3e 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -501,12 +501,17 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) { struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_neigh_node *router; orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + /* this is the last reference to this object */ + router = rcu_dereference_protected(orig_ifinfo->router, true); + if (router) + batadv_neigh_node_free_ref_now(router); kfree(orig_ifinfo); } @@ -702,6 +707,47 @@ free_orig_node: } /** + * batadv_purge_neigh_ifinfo - purge obsolete ifinfo entries from neighbor + * @bat_priv: the bat priv with all the soft interface information + * @neigh: orig node which is to be checked + */ +static void +batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, + struct batadv_neigh_node *neigh) +{ + struct batadv_neigh_ifinfo *neigh_ifinfo; + struct batadv_hard_iface *if_outgoing; + struct hlist_node *node_tmp; + + spin_lock_bh(&neigh->ifinfo_lock); + + /* for all ifinfo objects for this neighinator */ + hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, + &neigh->ifinfo_list, list) { + if_outgoing = neigh_ifinfo->if_outgoing; + + /* always keep the default interface */ + if (if_outgoing == BATADV_IF_DEFAULT) + continue; + + /* don't purge if the interface is not (going) down */ + if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && + (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && + (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + continue; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "neighbor/ifinfo purge: neighbor %pM, iface: %s\n", + neigh->addr, if_outgoing->net_dev->name); + + hlist_del_rcu(&neigh_ifinfo->list); + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + } + + spin_unlock_bh(&neigh->ifinfo_lock); +} + +/** * batadv_purge_orig_ifinfo - purge obsolete ifinfo entries from originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked @@ -800,6 +846,11 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, hlist_del_rcu(&neigh_node->list); batadv_neigh_node_free_ref(neigh_node); + } else { + /* only necessary if not the whole neighbor is to be + * deleted, but some interface has been removed. + */ + batadv_purge_neigh_ifinfo(bat_priv, neigh_node); } } @@ -857,7 +908,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, { struct batadv_neigh_node *best_neigh_node; struct batadv_hard_iface *hard_iface; - bool changed; + bool changed_ifinfo, changed_neigh; if (batadv_has_timed_out(orig_node->last_seen, 2 * BATADV_PURGE_TIMEOUT)) { @@ -867,10 +918,10 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, jiffies_to_msecs(orig_node->last_seen)); return true; } - changed = batadv_purge_orig_ifinfo(bat_priv, orig_node); - changed = changed || batadv_purge_orig_neighbors(bat_priv, orig_node); + changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node); + changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node); - if (!changed) + if (!changed_ifinfo && !changed_neigh) return false; /* first for NULL ... */ @@ -1028,7 +1079,8 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface); out: - batadv_hardif_free_ref(hard_iface); + if (hard_iface) + batadv_hardif_free_ref(hard_iface); return 0; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index d958e2dca52f..521fd4f3985e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -819,14 +819,17 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; - /* encrypt must be pending if auth is also pending */ - set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); + + /* If we're already encrypted set the REAUTH_PEND flag, + * otherwise set the ENCRYPT_PEND. + */ if (conn->key_type != 0xff) set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); + else + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } return 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 49774912cb01..682f33a38366 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -45,7 +45,7 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) return; clear_bit(HCI_INQUIRY, &hdev->flags); - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_conn_check_pending(hdev); @@ -1768,7 +1768,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); if (!test_bit(HCI_MGMT, &hdev->dev_flags)) @@ -3330,6 +3330,12 @@ static void hci_key_refresh_complete_evt(struct hci_dev *hdev, if (!conn) goto unlock; + /* For BR/EDR the necessary steps are taken through the + * auth_complete event. + */ + if (conn->type != LE_LINK) + goto unlock; + if (!ev->status) conn->sec_level = conn->pending_sec_level; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a1e5bb7d06e8..dc4d301d3a72 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7519,9 +7519,9 @@ int __init l2cap_init(void) l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, NULL, &l2cap_debugfs_fops); - debugfs_create_u16("l2cap_le_max_credits", 0466, bt_debugfs, + debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs, &le_max_credits); - debugfs_create_u16("l2cap_le_default_mps", 0466, bt_debugfs, + debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs, &le_default_mps); bt_6lowpan_init(); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 9203d5a1943f..474d36f93342 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -487,6 +487,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, { struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; + bool fdb_modified = false; /* some users want to always flood. */ if (hold_time(br) == 0) @@ -507,10 +508,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name); } else { /* fastpath: update of existing entry */ - fdb->dst = source; + if (unlikely(source != fdb->dst)) { + fdb->dst = source; + fdb_modified = true; + } fdb->updated = jiffies; if (unlikely(added_by_user)) fdb->added_by_user = 1; + if (unlikely(fdb_modified)) + fdb_notify(br, fdb, RTM_NEWNEIGH); } } else { spin_lock(&br->hash_lock); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7985deaff52f..04d6348fd530 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -147,8 +147,8 @@ static int br_handle_local_finish(struct sk_buff *skb) struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; - br_vlan_get_tag(skb, &vid); - if (p->flags & BR_LEARNING) + /* check if vlan is allowed, to avoid spoofing */ + if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); return 0; /* process further */ } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 80e1b0f60a30..2acf7fa1fec6 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -859,12 +859,12 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, return NF_STOLEN; } -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { int ret; - if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && + if (skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && !skb_is_gso(skb)) { if (br_parse_ip_options(skb)) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e74b6d530cb6..e8844d975b32 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -445,6 +445,20 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } +static int br_dev_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(dev); + + if (tb[IFLA_ADDRESS]) { + spin_lock_bh(&br->lock); + br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); + spin_unlock_bh(&br->lock); + } + + return register_netdevice(dev); +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -473,6 +487,7 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, .validate = br_validate, + .newlink = br_dev_newlink, .dellink = br_dev_delete, }; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 06811d79f89f..59d3a85c5873 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -581,6 +581,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, struct sk_buff *skb, u16 *vid); bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v, const struct sk_buff *skb); +bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_port_vlans *v, struct sk_buff *skb); @@ -648,6 +649,12 @@ static inline bool br_allowed_egress(struct net_bridge *br, return true; } +static inline bool br_should_learn(struct net_bridge_port *p, + struct sk_buff *skb, u16 *vid) +{ + return true; +} + static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_port_vlans *v, struct sk_buff *skb) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 4a3716102789..5fee2feaf292 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -241,6 +241,34 @@ bool br_allowed_egress(struct net_bridge *br, return false; } +/* Called under RCU */ +bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) +{ + struct net_bridge *br = p->br; + struct net_port_vlans *v; + + if (!br->vlan_enabled) + return true; + + v = rcu_dereference(p->vlan_info); + if (!v) + return false; + + br_vlan_get_tag(skb, vid); + if (!*vid) { + *vid = br_get_pvid(v); + if (*vid == VLAN_N_VID) + return false; + + return true; + } + + if (test_bit(*vid, v->vlan_bitmap)) + return true; + + return false; +} + /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ diff --git a/net/can/gw.c b/net/can/gw.c index ac31891967da..050a2110d43f 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -804,7 +804,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) u8 limhops = 0; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) @@ -893,7 +893,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) u8 limhops = 0; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dac7f9b98687..1948d592aa54 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -557,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, return r; } -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, +static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); @@ -570,6 +570,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, return ret; } +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, bool more) +{ + int ret; + struct kvec iov; + + /* sendpage cannot properly handle pages with page_count == 0, + * we need to fallback to sendmsg if that's the case */ + if (page_count(page) >= 1) + return __ceph_tcp_sendpage(sock, page, offset, size, more); + + iov.iov_base = kmap(page) + offset; + iov.iov_len = size; + ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); + kunmap(page); + + return ret; +} /* * Shutdown/close the socket for the given connection. diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index e632b5a52f5b..c547e46084d3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -329,6 +329,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_descend_once = %d", c->chooseleaf_descend_once); + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_vary_r = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_vary_r = %d", + c->chooseleaf_vary_r); + done: dout("crush_decode success\n"); return c; @@ -1548,8 +1553,10 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, return; for (i = 0; i < len; i++) { - if (osds[i] != CRUSH_ITEM_NONE && - osdmap->osd_primary_affinity[i] != + int osd = osds[i]; + + if (osd != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { break; } @@ -1563,10 +1570,9 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, * osd's pgs get rejected as primary. */ for (i = 0; i < len; i++) { - int osd; + int osd = osds[i]; u32 aff; - osd = osds[i]; if (osd == CRUSH_ITEM_NONE) continue; diff --git a/net/core/dev.c b/net/core/dev.c index 14dac0654f28..8908a68db449 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1326,7 +1326,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); @@ -2283,8 +2283,8 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { + unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { @@ -2297,15 +2297,30 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; + /* if skb->protocol is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } *depth = vlan_depth; @@ -2418,7 +2433,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2493,38 +2508,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - const struct net_device *dev, - netdev_features_t features) + netdev_features_t features) { int tmp; if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(dev, skb)) { + } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - netdev_features_t features = dev->features; + netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } - features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2532,9 +2545,9 @@ netdev_features_t netif_skb_dev_features(struct sk_buff *skb, NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } -EXPORT_SYMBOL(netif_skb_dev_features); +EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -3343,7 +3356,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3353,7 +3366,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -3953,6 +3966,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; @@ -4244,7 +4258,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); @@ -4543,6 +4557,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list) EXPORT_SYMBOL(netdev_adjacent_get_private); /** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +/** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position @@ -4624,6 +4664,32 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant @@ -5073,6 +5139,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_dev_get_private); + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5542,7 +5632,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); -static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { diff --git a/net/core/dst.c b/net/core/dst.c index ca4231ec7347..80d6286c8b62 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -142,12 +142,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard; + dst->output = dst_discard_sk; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst) /* The first case (dev==NULL) is required, when protocol module is unloaded. */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) - dst->input = dst->output = dst_discard; + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_discard_sk; + } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -361,7 +363,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, return; if (!unregister) { - dst->input = dst->output = dst_discard; + dst->input = dst_discard; + dst->output = dst_discard_sk; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); diff --git a/net/core/filter.c b/net/core/filter.c index e08b3822c72a..4aec7b93f1a9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -122,6 +122,13 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } +/* Register mappings for user programs. */ +#define A_REG 0 +#define X_REG 7 +#define TMP_REG 8 +#define ARG2_REG 2 +#define ARG3_REG 3 + /** * __sk_run_filter - run a filter on a given context * @ctx: buffer to run the filter on @@ -242,6 +249,8 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) regs[FP_REG] = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; regs[ARG1_REG] = (u64) (unsigned long) ctx; + regs[A_REG] = 0; + regs[X_REG] = 0; select_insn: goto *jumptable[insn->code]; @@ -600,6 +609,9 @@ static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb_is_nonlinear(skb)) return 0; + if (skb->len < sizeof(struct nlattr)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) return 0; @@ -618,11 +630,14 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb_is_nonlinear(skb)) return 0; + if (skb->len < sizeof(struct nlattr)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[A]; - if (nla->nla_len > A - skb->len) + if (nla->nla_len > skb->len - A) return 0; nla = nla_find_nested(nla, X); @@ -637,13 +652,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) return raw_smp_processor_id(); } -/* Register mappings for user programs. */ -#define A_REG 0 -#define X_REG 7 -#define TMP_REG 8 -#define ARG2_REG 2 -#define ARG3_REG 3 - static bool convert_bpf_extensions(struct sock_filter *fp, struct sock_filter_int **insnp) { @@ -1551,8 +1559,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, fp->jited = 0; err = sk_chk_filter(fp->insns, fp->len); - if (err) + if (err) { + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); return ERR_PTR(err); + } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. @@ -1737,7 +1750,6 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322ba..bd0767e6b2b3 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8f8a96ef9f3f..32d872eec7f5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1248,8 +1248,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; - neigh->nud_state = NUD_PROBE; - atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh->nud_state = NUD_INCOMPLETE; + atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81d3a9a08453..7c8ffd974961 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -24,7 +24,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 22931e1b99b4..30d903b19c62 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); - struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3825f669147b..2f385b9bccc0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *parent_css = css_parent(css); + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; @@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v) return 0; } -static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(css, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) @@ -239,7 +239,7 @@ static struct cftype ss_files[] = { { .name = "ifpriomap", .seq_show = read_priomap, - .write_string = write_priomap, + .write = write_priomap, }, { } /* terminate */ }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d4ff41739b0f..2d8d8fcfa060 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,15 +353,46 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + for_each_net(net) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); + /* Close the race with cleanup_net() */ + mutex_lock(&net_mutex); + rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); + mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); @@ -774,7 +805,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, return 0; } -static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, + u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ @@ -790,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev) size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + @@ -826,7 +859,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ - + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ @@ -888,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) return 0; } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, + u32 ext_filter_mask) { int err; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); @@ -1079,7 +1114,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_nest_end(skb, vfinfo); } - if (rtnl_port_fill(skb, dev)) + if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { @@ -1198,6 +1233,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; + int err; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -1218,11 +1254,17 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, - ext_filter_mask) <= 0) + err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI, + ext_filter_mask); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err <= 0) goto out; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1395,7 +1437,8 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, + struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1407,7 +1450,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = PTR_ERR(net); goto errout; } - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { err = -EPERM; goto errout; } @@ -1661,7 +1704,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; - err = do_setlink(dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, tb, ifname, 0); errout: return err; } @@ -1778,7 +1821,8 @@ err: } EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_group_changelink(struct net *net, int group, +static int rtnl_group_changelink(const struct sk_buff *skb, + struct net *net, int group, struct ifinfomsg *ifm, struct nlattr **tb) { @@ -1787,7 +1831,7 @@ static int rtnl_group_changelink(struct net *net, int group, for_each_netdev(net, dev) { if (dev->group == group) { - err = do_setlink(dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, tb, NULL, 0); if (err < 0) return err; } @@ -1929,12 +1973,12 @@ replay: modified = 1; } - return do_setlink(dev, ifm, tb, ifname, modified); + return do_setlink(skb, dev, ifm, tb, ifname, modified); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(net, + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, tb); return -ENODEV; @@ -2321,7 +2365,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) int err = -EINVAL; __u8 *addr; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2773,7 +2817,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sz_idx = type>>2; kind = type&3; - if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b62343f5837..8383b2bddeb9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3076,7 +3076,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (unlikely(p->len + len >= 65536)) return -E2BIG; - lp = NAPI_GRO_CB(p)->last ?: p; + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { @@ -3192,7 +3192,7 @@ merge: __skb_pull(skb, offset); - if (!NAPI_GRO_CB(p)->last) + if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; diff --git a/net/core/sock.c b/net/core/sock.c index b4fff008136f..664ee4295b6f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,55 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. + */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + #ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index d7af18859322..a4216a4c9572 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -49,7 +49,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); -int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype) { struct sock_fprog_kern *fprog; @@ -58,7 +58,7 @@ int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, unsigned int flen; int err = 0; - if (!ns_capable(user_ns, CAP_NET_ADMIN)) { + if (!may_report_filterinfo) { nla_reserve(skb, attrtype, 0); return 0; } diff --git a/net/core/utils.c b/net/core/utils.c index 2f737bf90b3f..eed34338736c 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -348,8 +348,8 @@ static void __net_random_once_deferred(struct work_struct *w) { struct __net_random_once_work *work = container_of(w, struct __net_random_once_work, work); - if (!static_key_enabled(work->key)) - static_key_slow_inc(work->key); + BUG_ON(!static_key_enabled(work->key)); + static_key_slow_dec(work->key); kfree(work); } @@ -367,7 +367,7 @@ static void __net_random_once_disable_jump(struct static_key *key) } bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *done_key) + struct static_key *once_key) { static DEFINE_SPINLOCK(lock); unsigned long flags; @@ -382,7 +382,7 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, *done = true; spin_unlock_irqrestore(&lock, flags); - __net_random_once_disable_jump(done_key); + __net_random_once_disable_jump(once_key); return true; } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 553644402670..f8b98d89c285 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1669,7 +1669,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; - if ((nlh->nlmsg_type == RTM_SETDCB) && !capable(CAP_NET_ADMIN)) + if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, diff --git a/net/dccp/output.c b/net/dccp/output.c index 8876078859da..0248e8a3460c 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -138,7 +138,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); return net_xmit_eval(err); } return -ENOBUFS; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index a603823a3e27..3b726f31c64c 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -574,7 +574,7 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) struct dn_ifaddr __rcu **ifap; int err = -EINVAL; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) @@ -618,7 +618,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) struct dn_ifaddr *ifa; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 57dc159245ec..d332aefb0846 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -505,7 +505,7 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *attrs[RTA_MAX+1]; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) @@ -530,7 +530,7 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *attrs[RTA_MAX+1]; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!net_eq(net, &init_net)) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index ce0cbbfe0f43..daccc4a36d80 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -752,7 +752,7 @@ static int dn_to_neigh_output(struct sk_buff *skb) return n->output(n, skb); } -static int dn_output(struct sk_buff *skb) +static int dn_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -838,6 +838,18 @@ drop: * Used to catch bugs. This should never normally get * called. */ +static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", + le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); + + kfree_skb(skb); + + return NET_RX_DROP; +} + static int dn_rt_bug(struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1463,7 +1475,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug; + rt->dst.output = dn_rt_bug_sk; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index e83015cecfa7..e4d9560a910b 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0eb5d5e76dfb..5db37cef50a9 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -406,8 +406,9 @@ static int dsa_of_probe(struct platform_device *pdev) goto out_free; } - chip_index = 0; + chip_index = -1; for_each_available_child_of_node(np, child) { + chip_index++; cd = &pd->chip[chip_index]; cd->mii_bus = &mdio_bus->dev; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8c54870db792..6d6dd345bc4d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1650,6 +1650,39 @@ static int __init init_ipv4_mibs(void) return register_pernet_subsys(&ipv4_mib_ops); } +static __net_init int inet_init_net(struct net *net) +{ + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = 32768; + net->ipv4.ip_local_ports.range[1] = 61000; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); + net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); + return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { + .init = inet_init_net, + .exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ + return register_pernet_subsys(&af_inet_ops); +} + static int ipv4_proc_init(void); /* @@ -1794,6 +1827,9 @@ static int __init inet_init(void) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif + + if (init_inet_pernet_ops()) + pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1a629f870274..255aa9946fe7 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif; + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b53f0bf84dca..b10cd43a4722 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -631,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, .flowi4_oif = nh->nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ @@ -820,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); if (!fi->fib_metrics) goto failure; } else fi->fib_metrics = (u32 *) dst_default_metrics; - fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0d1e2cb877ec..a56b8e6e866a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,11 +37,11 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = net->ipv4.sysctl_local_ports.range[0]; - *high = net->ipv4.sysctl_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f424465112..56cd458a1b8c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -522,7 +522,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index be8abe73bb9f..6f111e48e11c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,12 +42,12 @@ static bool ip_may_fragment(const struct sk_buff *skb) { return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - !skb->local_df; + skb->local_df; } static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { - if (skb->len <= mtu || skb->local_df) + if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) @@ -56,53 +56,6 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) -{ - unsigned int mtu; - - if (skb->local_df || !skb_is_gso(skb)) - return false; - - mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); - - /* if seglen > mtu, do software segmentation for IP fragmentation on - * output. DF bit cannot be set since ip_forward would have sent - * icmp error. - */ - return skb_gso_network_seglen(skb) > mtu; -} - -/* called if GSO skb needs to be fragmented on forward */ -static int ip_forward_finish_gso(struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - netdev_features_t features; - struct sk_buff *segs; - int ret = 0; - - features = netif_skb_dev_features(skb, dst->dev); - segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR(segs)) { - kfree_skb(skb); - return -ENOMEM; - } - - consume_skb(skb); - - do { - struct sk_buff *nskb = segs->next; - int err; - - segs->next = NULL; - err = dst_output(segs); - - if (err && ret == 0) - ret = err; - segs = nskb; - } while (segs); - - return ret; -} static int ip_forward_finish(struct sk_buff *skb) { @@ -114,9 +67,6 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); - if (ip_gso_exceeds_dst_mtu(skb)) - return ip_forward_finish_gso(skb); - return dst_output(skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index c10a3ce5cbff..ed32313e307c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -232,8 +232,9 @@ static void ip_expire(unsigned long arg) * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1a0755fea491..a52f50187b54 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static int ip_finish_output_gso(struct sk_buff *skb) +{ + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -389,6 +433,7 @@ packet_routed: ip_select_ident_more(skb, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e77381d1df9a..2acc2337d38b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -442,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -538,9 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; - bool connected = true; + bool connected; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + connected = (tunnel->parms.iph.daddr != 0); dst = tnl_params->daddr; if (dst == 0) { @@ -670,7 +673,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, return; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -722,19 +725,18 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: - t = NULL; - if (dev == itn->fb_tunnel_dev) + if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) - t = netdev_priv(dev); + if (t == NULL) + t = netdev_priv(dev); + } memcpy(p, &t->parms, sizeof(*p)); break; @@ -881,6 +883,7 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); } rtnl_unlock(); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index e0c2b1d2ea4e..bcf206c79005 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { @@ -76,7 +76,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, iph->ttl = ttl; __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - err = ip_local_out(skb); + err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index afcee51b90ed..13ef00f1e17b 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -239,6 +239,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) static int vti4_err(struct sk_buff *skb, u32 info) { __be32 spi; + __u32 mark; struct xfrm_state *x; struct ip_tunnel *tunnel; struct ip_esp_hdr *esph; @@ -254,6 +255,8 @@ static int vti4_err(struct sk_buff *skb, u32 info) if (!tunnel) return -1; + mark = be32_to_cpu(tunnel->parms.o_key); + switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); @@ -281,7 +284,7 @@ static int vti4_err(struct sk_buff *skb, u32 info) return 0; } - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 28863570dd60..d84dc8d4c916 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -455,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_iif = skb->skb_iif, + .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c49dcd0284a0..4bfaedf9b34e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ipv4_is_multicast(iph->daddr)) { if (ipv4_is_zeronet(iph->saddr)) return ipv4_is_local_multicast(iph->daddr) ^ invert; - flow.flowi4_iif = 0; - } else { - flow.flowi4_iif = LOOPBACK_IFINDEX; } - + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 12e13bd82b5b..f40f321b41fc 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) err = ip_defrag(skb, user); local_bh_enable(); - if (!err) + if (!err) { ip_send_check(ip_hdr(skb)); + skb->local_df = 1; + } return err; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index f4b19e5dde54..044a0ddf6a79 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -236,15 +236,15 @@ exit: static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { - kgid_t *data = net->ipv4.sysctl_ping_group_range; + kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } @@ -252,26 +252,33 @@ int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34d094cadb11..5e676be3daeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1129,7 +1129,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1519,7 +1519,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *out_dev; unsigned int flags = 0; bool do_cache; - u32 itag; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -1700,8 +1700,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, - LOOPBACK_IFINDEX, - dev, in_dev, &itag); + 0, dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; goto local_input; @@ -2218,7 +2217,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) @@ -2357,7 +2356,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43d..5cde8f263d40 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,10 +45,10 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = range[0]; - net->ipv4.sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = range[0]; + net->ipv4.ip_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -57,7 +57,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos) { struct net *net = - container_of(table->data, struct net, ipv4.sysctl_local_ports.range); + container_of(table->data, struct net, ipv4.ip_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -87,14 +87,14 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); + container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } /* Update system visible IP port range */ @@ -102,11 +102,11 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); - write_seqlock(&net->ipv4.sysctl_local_ports.lock); + container_of(table->data, struct net, ipv4.ping_group_range.range); + write_seqlock(&net->ipv4.ip_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -805,7 +805,7 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ping_group_range", - .data = &init_net.ipv4.sysctl_ping_group_range, + .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, @@ -819,8 +819,8 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), - .data = &init_net.ipv4.sysctl_local_ports.range, + .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), + .data = &init_net.ipv4.ip_local_ports.range, .mode = 0644, .proc_handler = ipv4_local_port_range, }, @@ -858,20 +858,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[i].data += (void *)net - (void *)&init_net; } - /* - * Sane defaults - nobody may create ping sockets. - * Boot scripts should set this to distro-specific group. - */ - net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); - net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - - /* - * Set defaults for local port range - */ - seqlock_init(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = 32768; - net->ipv4.sysctl_local_ports.range[1] = 61000; - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 8bf224516ba2..b4f1b29b08bd 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -409,7 +409,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; ratio += cnt; - ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6b46eb2f94c..3a26b3b23f16 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2684,13 +2684,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d4f015ad6c84..f7a2ec3ac584 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) @@ -168,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_css(css); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: res_counter_reset_max(&cg_proto->memory_allocated); break; @@ -187,13 +190,13 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, @@ -205,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 699fb102e971..2d340bd2cd3d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -981,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1930,10 +1930,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. - * We abuse smp_mb__after_clear_bit() because - * there is no smp_mb__after_set_bit() yet */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (atomic_read(&sk->sk_wmem_alloc) > limit) break; } @@ -2441,8 +2439,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - if (likely(!err)) + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans++; + } return err; } @@ -2452,12 +2456,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { - /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans++; - #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731fd..186a8ecf92fa 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - - skb->protocol = htons(ETH_P_IP); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; return x->outer_mode->output2(x, skb); } @@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER - if (!skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IP); return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER + if (!x) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, dst->dev, - x->outer_mode->afinfo->output_finish, + NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 7f7b243e8139..a2ce0101eaac 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -50,8 +50,12 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(protocol); - for_each_protocol_rcu(*proto_handlers(protocol), handler) + if (!head) + return 0; + + for_each_protocol_rcu(*head, handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; @@ -64,15 +68,20 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, { int ret; struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - for_each_protocol_rcu(*proto_handlers(nexthdr), handler) + if (!head) + goto out; + + for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; +out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); @@ -208,6 +217,9 @@ int xfrm4_protocol_register(struct xfrm4_protocol *handler, int ret = -EEXIST; int priority = handler->priority; + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + mutex_lock(&xfrm4_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), @@ -250,6 +262,9 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, struct xfrm4_protocol *t; int ret = -ENOENT; + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + mutex_lock(&xfrm4_protocol_mutex); for (pprev = proto_handlers(protocol); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index c9138189415a..d4ade34ab375 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -224,9 +224,8 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, return dst; } -int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { - struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; struct dst_entry *dst; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 34e0ded5c14b..87891f5f57b5 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1459,7 +1459,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) if (w->skip) { w->skip--; - continue; + goto skip; } err = w->func(w); @@ -1469,6 +1469,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->count++; continue; } +skip: w->state = FWS_U; case FWS_U: if (fn == w->root) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c98338b81d30..9d921462b57f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1559,6 +1559,15 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } +static void ip6gre_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + if (dev != ign->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + static size_t ip6gre_get_size(const struct net_device *dev) { return @@ -1636,6 +1645,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .validate = ip6gre_tunnel_validate, .newlink = ip6gre_newlink, .changelink = ip6gre_changelink, + .dellink = ip6gre_dellink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, }; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 59f95affceb0..b2f091566f88 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -196,7 +196,6 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, unsigned int off; u16 flush = 1; int proto; - __wsum csum; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); @@ -264,13 +263,10 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, NAPI_GRO_CB(skb)->flush |= flush; - csum = skb->csum; - skb_postpull_rcsum(skb, iph, skb_network_header_len(skb)); + skb_gro_postpull_rcsum(skb, iph, nlen); pp = ops->callbacks.gro_receive(head, skb); - skb->csum = csum; - out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3284d61577c0..fbf11562b54c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -132,7 +132,7 @@ static int ip6_finish_output(struct sk_buff *skb) return ip6_finish_output2(skb); } -int ip6_output(struct sk_buff *skb) +int ip6_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); @@ -344,12 +344,16 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { - if (skb->len <= mtu || skb->local_df) + if (skb->len <= mtu) return false; + /* ipv6 conntrack defrag sets max_frag_size + local_df */ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) return true; + if (skb->local_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -1225,7 +1229,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, unsigned int maxnonfragsize, headersize; headersize = sizeof(struct ipv6hdr) + - (opt ? opt->tot_len : 0) + + (opt ? opt->opt_flen + opt->opt_nflen : 0) + (dst_allfrag(&rt->dst) ? sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e1df691d78be..f6a66bb4114d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1340,8 +1340,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; - struct ip6_tnl *t = NULL; - struct net *net = dev_net(dev); + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); switch (cmd) { @@ -1353,11 +1353,11 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } - if (t == NULL) - t = netdev_priv(dev); ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { err = -EFAULT; @@ -1557,7 +1557,7 @@ static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[]) { u8 proto; - if (!data) + if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index b7c0f827140b..6cc9f9371cc5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -511,6 +511,7 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __be32 spi; + __u32 mark; struct xfrm_state *x; struct ip6_tnl *t; struct ip_esp_hdr *esph; @@ -524,6 +525,8 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!t) return -1; + mark = be32_to_cpu(t->parms.o_key); + switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); @@ -545,7 +548,7 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, type != NDISC_REDIRECT) return 0; - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; @@ -1097,7 +1100,6 @@ static int __init vti6_tunnel_init(void) err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); if (err < 0) { - unregister_pernet_device(&vti6_net_ops); pr_err("%s: can't register vti6 protocol\n", __func__); goto out; @@ -1106,7 +1108,6 @@ static int __init vti6_tunnel_init(void) err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); if (err < 0) { xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti6_net_ops); pr_err("%s: can't register vti6 protocol\n", __func__); goto out; @@ -1116,7 +1117,6 @@ static int __init vti6_tunnel_init(void) if (err < 0) { xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti6_net_ops); pr_err("%s: can't register vti6 protocol\n", __func__); goto out; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8737400af0a0..8250474ab7dc 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -700,7 +700,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct mr6_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, - .flowi6_iif = skb->skb_iif, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; int err; @@ -1633,7 +1633,7 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) { struct mr6_table *mrt; struct flowi6 fl6 = { - .flowi6_iif = skb->skb_iif, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09a22f4f36c9..ca8d4ea48a5d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -851,7 +851,7 @@ out: static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); - const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + @@ -944,10 +944,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* * Change: router to host */ - struct rt6_info *rt; - rt = rt6_get_dflt_router(saddr, dev); - if (rt) - ip6_del_rt(rt); + rt6_clean_tohost(dev_net(dev), saddr); } out: diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 95f3f1da0d7f..d38e6a8d8b9f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -30,13 +30,15 @@ int ip6_route_me_harder(struct sk_buff *skb) .daddr = iph->daddr, .saddr = iph->saddr, }; + int err; dst = ip6_route_output(net, skb->sk, &fl6); - if (dst->error) { + err = dst->error; + if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); - return dst->error; + return err; } /* Drop old route. */ diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index e0983f3648a6..790e0c6b19e1 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -33,6 +33,7 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, struct ipv6hdr *iph = ipv6_hdr(skb); bool ret = false; struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, .daddr = iph->saddr, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 6313abd53c9d..56596ce390a1 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -12,7 +12,7 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { static atomic_t ipv6_fragmentation_id; struct in6_addr addr; - int old, new; + int ident; #if IS_ENABLED(CONFIG_IPV6) struct inet_peer *peer; @@ -26,15 +26,10 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) return; } #endif - do { - old = atomic_read(&ipv6_fragmentation_id); - new = old + 1; - if (!new) - new = 1; - } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old); + ident = atomic_inc_return(&ipv6_fragmentation_id); addr = rt->rt6i_dst.addr; - addr.s6_addr32[0] ^= (__force __be32)new; + addr.s6_addr32[0] ^= (__force __be32)ident; fhdr->identification = htonl(secure_ipv6_id(addr.s6_addr32)); } EXPORT_SYMBOL(ipv6_select_ident); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5015c50a5ba7..6ebdb7b6744c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -84,9 +84,9 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); @@ -290,7 +290,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard, + .output = dst_discard_sk, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -1058,7 +1058,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; if (dst_metrics_read_only(&ort->dst)) new->_metrics = ort->dst._metrics; @@ -1273,6 +1273,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; fl6.daddr = iph->daddr; @@ -1294,6 +1295,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; fl6.daddr = msg->dest; @@ -1338,7 +1340,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = IPV6_MIN_MTU; @@ -1348,7 +1350,8 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) mtu = idev->cnf.mtu6; rcu_read_unlock(); - return mtu; +out: + return min_t(unsigned int, mtu, IP6_MAX_MTU); } static struct dst_entry *icmp6_dst_gc_list; @@ -1576,7 +1579,7 @@ int ip6_route_add(struct fib6_config *cfg) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard; + rt->dst.output = dst_discard_sk; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -2128,7 +2131,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2139,7 +2142,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); @@ -2231,6 +2234,27 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) fib6_clean_all(net, fib6_remove_prefsrc, &adni); } +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +/* Remove routers and update dst entries when gateway turn into host. */ +static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +{ + struct in6_addr *gateway = (struct in6_addr *)arg; + + if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || + ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + return -1; + } + return 0; +} + +void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) +{ + fib6_clean_all(net, fib6_clean_tohost, gateway); +} + struct arg_dev_net { struct net_device *dev; struct net *net; @@ -2706,6 +2730,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); + if (tb[RTA_MARK]) + fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1693c8d885f0..e5a453ca302e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -974,8 +974,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto out; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, - ttl, df, !net_eq(tunnel->net, dev_net(dev))); + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df, + !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -1126,8 +1127,8 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip_tunnel_parm p; struct ip_tunnel_prl prl; - struct ip_tunnel *t; - struct net *net = dev_net(dev); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; @@ -1138,16 +1139,15 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: #endif - t = NULL; if (dev == sitn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } t = ipip6_tunnel_locate(net, &p, 0); + if (t == NULL) + t = netdev_priv(dev); } - if (t == NULL) - t = netdev_priv(dev); err = -EFAULT; if (cmd == SIOCGETTUNNEL) { @@ -1243,9 +1243,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; if (dev == sitn->fb_tunnel_dev) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); break; @@ -1261,9 +1258,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EFAULT; if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; switch (cmd) { case SIOCDELPRL: @@ -1291,8 +1285,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(ip6rd))) goto done; - t = netdev_priv(dev); - if (cmd != SIOCDEL6RD) { err = ipip6_tunnel_update_6rd(t, &ip6rd); if (err < 0) diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 0d78132ff18a..8517d3cd1aed 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -42,7 +42,7 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6cd625e37706..b930d080c66f 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -114,12 +114,6 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - skb->protocol = htons(ETH_P_IPV6); skb->local_df = 1; return x->outer_mode->output2(x, skb); @@ -128,11 +122,13 @@ EXPORT_SYMBOL(xfrm6_prepare_output); int xfrm6_output_finish(struct sk_buff *skb) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + skb->protocol = htons(ETH_P_IPV6); + #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IPV6); return xfrm_output(skb); } @@ -142,6 +138,13 @@ static int __xfrm6_output(struct sk_buff *skb) struct xfrm_state *x = dst->xfrm; int mtu; +#ifdef CONFIG_NETFILTER + if (!x) { + IP6CB(skb)->flags |= IP6SKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->protocol == htons(ETH_P_IPV6)) mtu = ip6_skb_dst_mtu(skb); else @@ -163,8 +166,9 @@ static int __xfrm6_output(struct sk_buff *skb) return x->outer_mode->afinfo->output_finish(skb); } -int xfrm6_output(struct sk_buff *skb) +int xfrm6_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, - skb_dst(skb)->dev, __xfrm6_output); + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, + NULL, skb_dst(skb)->dev, __xfrm6_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 6ab989c486f7..54d13f8dbbae 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -50,6 +50,10 @@ int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; for_each_protocol_rcu(*proto_handlers(protocol), handler) if ((ret = handler->cb_handler(skb, err)) <= 0) @@ -184,10 +188,12 @@ int xfrm6_protocol_register(struct xfrm6_protocol *handler, struct xfrm6_protocol __rcu **pprev; struct xfrm6_protocol *t; bool add_netproto = false; - int ret = -EEXIST; int priority = handler->priority; + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + mutex_lock(&xfrm6_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), @@ -230,6 +236,9 @@ int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, struct xfrm6_protocol *t; int ret = -ENOENT; + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + mutex_lock(&xfrm6_protocol_mutex); for (pprev = proto_handlers(protocol); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 01e77b0ae075..8c9d7302c846 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1830,7 +1830,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_lock_irqsave(&list->lock, flags); while (list_skb != (struct sk_buff *)list) { - if (msg->tag != IUCV_SKB_CB(list_skb)->tag) { + if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { this = list_skb; break; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 47f7a5490555..a4e37d7158dc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1131,10 +1131,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, skb->local_df = 1; #if IS_ENABLED(CONFIG_IPV6) if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) - error = inet6_csk_xmit(skb, NULL); + error = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif - error = ip_queue_xmit(skb, fl); + error = ip_queue_xmit(tunnel->sock, skb, fl); /* Update stats */ if (error >= 0) { diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0b44d855269c..3397fe6897c0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -487,7 +487,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m xmit: /* Queue the packet to IP for output */ - rc = ip_queue_xmit(skb, &inet->cork.fl); + rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index bd1fd8ea5105..75b5dd2c9267 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -249,7 +249,7 @@ ieee80211_new_chanctx(struct ieee80211_local *local, if (!local->use_chanctx) { local->_oper_chandef = *chandef; - ieee80211_hw_config(local, 0); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { err = drv_add_chanctx(local, ctx); if (err) { @@ -286,7 +286,7 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local, check_single_channel = true; local->hw.conf.radar_enabled = false; - ieee80211_hw_config(local, 0); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { drv_remove_chanctx(local, ctx); } @@ -492,6 +492,13 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, rx_chains_static = max(rx_chains_static, needed_static); rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); } + + /* Disable SMPS for the monitor interface */ + sdata = rcu_dereference(local->monitor_sdata); + if (sdata && + rcu_access_pointer(sdata->vif.chanctx_conf) == &chanctx->conf) + rx_chains_dynamic = rx_chains_static = local->rx_chains; + rcu_read_unlock(); if (!local->use_chanctx) { diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 222c28b75315..f169b6ee94ee 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -317,6 +317,7 @@ struct ieee80211_roc_work { bool started, abort, hw_begun, notified; bool to_be_freed; + bool on_channel; unsigned long hw_start_time; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index b055f6a55c68..4c1bf61bc778 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -148,6 +148,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!rcu_access_pointer(sdata->vif.chanctx_conf)) continue; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; power = min(power, sdata->vif.bss_conf.txpower); } rcu_read_unlock(); @@ -199,7 +201,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; - if (!changed) + if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) return; drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index dee50aefd6e8..27600a9808ba 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3598,18 +3598,24 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) sdata_lock(sdata); - if (ifmgd->auth_data) { + if (ifmgd->auth_data || ifmgd->assoc_data) { + const u8 *bssid = ifmgd->auth_data ? + ifmgd->auth_data->bss->bssid : + ifmgd->assoc_data->bss->bssid; + /* - * If we are trying to authenticate while suspending, cfg80211 - * won't know and won't actually abort those attempts, thus we - * need to do that ourselves. + * If we are trying to authenticate / associate while suspending, + * cfg80211 won't know and won't actually abort those attempts, + * thus we need to do that ourselves. */ - ieee80211_send_deauth_disassoc(sdata, - ifmgd->auth_data->bss->bssid, + ieee80211_send_deauth_disassoc(sdata, bssid, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, false, frame_buf); - ieee80211_destroy_auth_data(sdata, false); + if (ifmgd->assoc_data) + ieee80211_destroy_assoc_data(sdata, false); + if (ifmgd->auth_data) + ieee80211_destroy_auth_data(sdata, false); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 0c2a29484c07..7a17decd27f9 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -333,7 +333,7 @@ void ieee80211_sw_roc_work(struct work_struct *work) container_of(work, struct ieee80211_roc_work, work.work); struct ieee80211_sub_if_data *sdata = roc->sdata; struct ieee80211_local *local = sdata->local; - bool started; + bool started, on_channel; mutex_lock(&local->mtx); @@ -354,13 +354,26 @@ void ieee80211_sw_roc_work(struct work_struct *work) if (!roc->started) { struct ieee80211_roc_work *dep; - /* start this ROC */ + WARN_ON(local->use_chanctx); + + /* If actually operating on the desired channel (with at least + * 20 MHz channel width) don't stop all the operations but still + * treat it as though the ROC operation started properly, so + * other ROC operations won't interfere with this one. + */ + roc->on_channel = roc->chan == local->_oper_chandef.chan && + local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 && + local->_oper_chandef.width != NL80211_CHAN_WIDTH_10; - /* switch channel etc */ + /* start this ROC */ ieee80211_recalc_idle(local); - local->tmp_channel = roc->chan; - ieee80211_hw_config(local, 0); + if (!roc->on_channel) { + ieee80211_offchannel_stop_vifs(local); + + local->tmp_channel = roc->chan; + ieee80211_hw_config(local, 0); + } /* tell userspace or send frame */ ieee80211_handle_roc_started(roc); @@ -379,9 +392,10 @@ void ieee80211_sw_roc_work(struct work_struct *work) finish: list_del(&roc->list); started = roc->started; + on_channel = roc->on_channel; ieee80211_roc_notify_destroy(roc, !roc->abort); - if (started) { + if (started && !on_channel) { ieee80211_flush_queues(local, NULL); local->tmp_channel = NULL; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 216c45b949e5..2b608b2b70ec 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1231,7 +1231,8 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) && test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { sta->last_rx = jiffies; - if (ieee80211_is_data(hdr->frame_control)) { + if (ieee80211_is_data(hdr->frame_control) && + !is_multicast_ether_addr(hdr->addr1)) { sta->last_rx_rate_idx = status->rate_idx; sta->last_rx_rate_flag = status->flag; sta->last_rx_rate_vht_flag = status->vht_flag; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 137a192e64bc..847d92f6bef6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1148,7 +1148,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) atomic_dec(&ps->num_sta_ps); /* This station just woke up and isn't aware of our SMPS state */ - if (!ieee80211_smps_is_restrictive(sta->known_smps_mode, + if (!ieee80211_vif_is_mesh(&sdata->vif) && + !ieee80211_smps_is_restrictive(sta->known_smps_mode, sdata->smps_mode) && sta->known_smps_mode != sdata->bss->req_smps && sta_info_tx_streams(sta) != 1) { diff --git a/net/mac80211/status.c b/net/mac80211/status.c index e6e574a307c8..60cb7a665976 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -314,10 +314,9 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; - if ((info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) || - (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)) + if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; - else if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) + if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); @@ -618,6 +617,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sta, true, acked); if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->last_tx_rate = info->status.rates[rates_idx]; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index a0b0aea76525..cec5b60487a4 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -21,10 +21,10 @@ #define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ __field(bool, p2p) \ - __string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>") + __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ - __assign_str(vif_name, sdata->dev ? sdata->dev->name : sdata->name) + __assign_str(vif_name, sdata->name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 275c94f995f7..3c365837e910 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1780,7 +1780,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->mtx); if (sched_scan_stopped) - cfg80211_sched_scan_stopped(local->hw.wiphy); + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); /* * If this is for hw restart things are still running. diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index e9e36a256165..9265adfdabfc 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -129,9 +129,12 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; - /* A VHT STA must support 40 MHz */ - if (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) - return; + /* + * A VHT STA must support 40 MHz, but if we verify that here + * then we break a few things - some APs (e.g. Netgear R6300v2 + * and others based on the BCM4360 chipset) will unset this + * capability bit when operating in 20 MHz. + */ vht_cap->vht_supported = true; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f26ee46b51f..3d2d2c8108ca 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1392,15 +1392,19 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (ipip) { __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; /* Update the MTU */ if (ic->type == ICMP_DEST_UNREACH && ic->code == ICMP_FRAG_NEEDED) { struct ip_vs_dest *dest = cp->dest; u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; /* Strip outer IP and ICMP, go to IPIP header */ - __skb_pull(skb, ihl + sizeof(_icmph)); + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_ipip; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", @@ -1408,7 +1412,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); /* Client uses PMTUD? */ - if (!(cih->frag_off & htons(IP_DF))) + if (!(frag_off & htons(IP_DF))) goto ignore_ipip; /* Prefer the resulting PMTU */ if (dest) { @@ -1427,12 +1431,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* Strip outer IP, ICMP and IPIP, go to IP header of * original request. */ - __skb_pull(skb, offset2); + if (pskb_pull(skb, offset2) == NULL) + goto ignore_ipip; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, - ic->type, ic->code, ntohl(info)); - icmp_send(skb, ic->type, ic->code, info); + type, code, ntohl(info)); + icmp_send(skb, type, code, info); /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6dba48efe01e..1f4f954c4b47 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -1795,6 +1795,7 @@ int nf_conntrack_init_net(struct net *net) int cpu; atomic_set(&net->ct.count, 0); + seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ccc46fa5edbc..58579634427d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1336,6 +1336,9 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) #ifdef CONFIG_NF_NAT_NEEDED int ret; + if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) + return 0; + ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, cda[CTA_NAT_DST]); if (ret < 0) diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7bd03decd36c..825c3e3f8305 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -605,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { - .exit = nf_conntrack_pptp_net_exit, -}; - static int __init nf_conntrack_pptp_init(void) { - int rv; - - rv = nf_conntrack_helper_register(&pptp); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); - if (rv < 0) - nf_conntrack_helper_unregister(&pptp); - return rv; + return nf_conntrack_helper_register(&pptp); } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9d9c0dade602..d5665739e3b1 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -66,7 +66,7 @@ static inline struct netns_proto_gre *gre_pernet(struct net *net) return net_generic(net, proto_gre_net_id); } -void nf_ct_gre_keymap_flush(struct net *net) +static void nf_ct_gre_keymap_flush(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; @@ -78,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net) } write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 90998a6ff8b9..345acfb1720b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,8 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1]) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - u32 mask; + u32 mask = nft_cmp_fast_mask(priv->len); - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len); if ((data[priv->sreg].data[0] & mask) == priv->data) return; data[NFT_REG_VERDICT].verdict = NFT_BREAK; @@ -67,20 +66,6 @@ struct nft_jumpstack { int rulenum; }; -static inline void -nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt, - struct nft_jumpstack *jumpstack, unsigned int stackptr) -{ - struct nft_stats __percpu *stats; - const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this; - - rcu_read_lock_bh(); - stats = rcu_dereference(nft_base_chain(chain)->stats); - __this_cpu_inc(stats->pkts); - __this_cpu_add(stats->bytes, pkt->skb->len); - rcu_read_unlock_bh(); -} - enum nft_trace { NFT_TRACE_RULE, NFT_TRACE_RETURN, @@ -118,13 +103,14 @@ static void nft_trace_packet(const struct nft_pktinfo *pkt, unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { - const struct nft_chain *chain = ops->priv; + const struct nft_chain *chain = ops->priv, *basechain = chain; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_data data[NFT_REG_MAX + 1]; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - int rulenum = 0; + struct nft_stats __percpu *stats; + int rulenum; /* * Cache cursor to avoid problems in case that the cursor is updated * while traversing the ruleset. @@ -132,6 +118,7 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); do_chain: + rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); next_rule: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; @@ -157,8 +144,10 @@ next_rule: switch (data[NFT_REG_VERDICT].verdict) { case NFT_BREAK: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; - /* fall through */ + continue; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); continue; } break; @@ -184,37 +173,44 @@ next_rule: jumpstack[stackptr].rule = rule; jumpstack[stackptr].rulenum = rulenum; stackptr++; - /* fall through */ + chain = data[NFT_REG_VERDICT].chain; + goto do_chain; case NFT_GOTO: + if (unlikely(pkt->skb->nf_trace)) + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + chain = data[NFT_REG_VERDICT].chain; goto do_chain; case NFT_RETURN: if (unlikely(pkt->skb->nf_trace)) nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); - - /* fall through */ + break; case NFT_CONTINUE: + if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) + nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); break; default: WARN_ON(1); } if (stackptr > 0) { - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); - stackptr--; chain = jumpstack[stackptr].chain; rule = jumpstack[stackptr].rule; rulenum = jumpstack[stackptr].rulenum; goto next_rule; } - nft_chain_stats(chain, pkt, jumpstack, stackptr); if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_POLICY); + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + + rcu_read_lock_bh(); + stats = rcu_dereference(nft_base_chain(basechain)->stats); + __this_cpu_inc(stats->pkts); + __this_cpu_add(stats->bytes, pkt->skb->len); + rcu_read_unlock_bh(); - return nft_base_chain(chain)->policy; + return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e8138da4c14f..23ef77c60fff 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -256,15 +256,15 @@ replay: #endif { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(nskb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - kfree_skb(nskb); - return netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(skb, nlh, -EOPNOTSUPP); + return kfree_skb(skb); } while (skb->len >= nlmsg_total_size(0)) { @@ -368,14 +368,13 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct net *net = sock_net(skb->sk); int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { netlink_ack(skb, nlh, -EPERM); return; } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 954925db414d..e2b3f51c81f1 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -128,7 +128,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, BUG_ON(err < 0); desc.len *= BITS_PER_BYTE; - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - desc.len); + mask = nft_cmp_fast_mask(desc.len); priv->data = data.data[0] & mask; priv->len = desc.len; return 0; diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 6efe4e5a81c6..8fd324116e6f 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -5,23 +5,35 @@ * they serve as the hanging-off data accessed through repl.data[]. */ +/* tbl has the following structure equivalent, but is C99 compliant: + * struct { + * struct type##_replace repl; + * struct type##_standard entries[nhooks]; + * struct type##_error term; + * } *tbl; + */ + #define xt_alloc_initial_table(type, typ2) ({ \ unsigned int hook_mask = info->valid_hooks; \ unsigned int nhooks = hweight32(hook_mask); \ unsigned int bytes = 0, hooknum = 0, i = 0; \ struct { \ struct type##_replace repl; \ - struct type##_standard entries[nhooks]; \ - struct type##_error term; \ - } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + struct type##_standard entries[]; \ + } *tbl; \ + struct type##_error *term; \ + size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \ + __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \ + tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \ if (tbl == NULL) \ return NULL; \ + term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ - tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ - sizeof(struct type##_error); \ + sizeof(struct type##_error); \ for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ if (!(hook_mask & 1)) \ continue; \ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 894cda0206bb..f22757a29cd0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1360,7 +1360,74 @@ retry: return err; } -static inline int netlink_capable(const struct socket *sock, unsigned int flag) +/** + * __netlink_ns_capable - General netlink message capability test + * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, + struct user_namespace *user_ns, int cap) +{ + return ((nsp->flags & NETLINK_SKB_DST) || + file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(__netlink_ns_capable); + +/** + * netlink_ns_capable - General netlink message capability test + * @skb: socket buffer holding a netlink command from userspace + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in the user namespace @user_ns. + */ +bool netlink_ns_capable(const struct sk_buff *skb, + struct user_namespace *user_ns, int cap) +{ + return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); +} +EXPORT_SYMBOL(netlink_ns_capable); + +/** + * netlink_capable - Netlink global message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap in all user namespaces. + */ +bool netlink_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, &init_user_ns, cap); +} +EXPORT_SYMBOL(netlink_capable); + +/** + * netlink_net_capable - Netlink network namespace message capability test + * @skb: socket buffer holding a netlink command from userspace + * @cap: The capability to use + * + * Test to see if the opener of the socket we received the message + * from had when the netlink socket was created and the sender of the + * message has has the capability @cap over the network namespace of + * the socket we received the message from. + */ +bool netlink_net_capable(const struct sk_buff *skb, int cap) +{ + return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); +} +EXPORT_SYMBOL(netlink_net_capable); + +static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); @@ -1428,7 +1495,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, /* Only superuser is allowed to listen multicasts */ if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) @@ -1490,7 +1557,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && - !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; if (!nlk->portid) @@ -2096,7 +2163,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) @@ -2228,6 +2295,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; int err; struct scm_cookie scm; + u32 netlink_skb_flags = 0; if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; @@ -2247,8 +2315,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && - !netlink_capable(sock, NL_CFG_F_NONROOT_SEND)) + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; + netlink_skb_flags |= NETLINK_SKB_DST; } else { dst_portid = nlk->dst_portid; dst_group = nlk->dst_group; @@ -2278,6 +2347,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = siocb->scm->creds; + NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index b1dcdb932a86..a3ba3ca0ff92 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -561,7 +561,7 @@ static int genl_family_rcv_msg(struct genl_family *family, return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - !capable(CAP_NET_ADMIN)) + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index a3d6951602db..ebb6e2442554 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -174,7 +174,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) skb->local_df = 1; - return iptunnel_xmit(rt, skb, fl.saddr, + return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, false); diff --git a/net/packet/diag.c b/net/packet/diag.c index 533ce4ff108a..92f2c7107eec 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -128,6 +128,7 @@ static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, + bool may_report_filterinfo, struct user_namespace *user_ns, u32 portid, u32 seq, u32 flags, int sk_ino) { @@ -172,7 +173,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_FILTER) && - sock_diag_put_filterinfo(user_ns, sk, skb, PACKET_DIAG_FILTER)) + sock_diag_put_filterinfo(may_report_filterinfo, sk, skb, + PACKET_DIAG_FILTER)) goto out_nlmsg_trim; return nlmsg_end(skb, nlh); @@ -188,9 +190,11 @@ static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct packet_diag_req *req; struct net *net; struct sock *sk; + bool may_report_filterinfo; net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); + may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN); mutex_lock(&net->packet.sklist_lock); sk_for_each(sk, &net->packet.sklist) { @@ -200,6 +204,7 @@ static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (sk_diag_fill(sk, skb, req, + may_report_filterinfo, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index dc15f4300808..b64151ade6b3 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -70,10 +70,10 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) int err; u8 pnaddr; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); @@ -233,10 +233,10 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh) int err; u8 dst; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b7ebe23cdedf..d67de453c35a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -598,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -606,7 +606,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 45033358358e..aa8bf6786008 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -429,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -437,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/send.c b/net/rds/send.c index a82fb660ec00..23718160d71e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -107,7 +107,7 @@ static int acquire_in_xmit(struct rds_connection *conn) static void release_in_xmit(struct rds_connection *conn) { clear_bit(RDS_IN_XMIT, &conn->c_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk @@ -661,7 +661,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); @@ -691,7 +691,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) } /* order flag updates with the rs lock */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 81cf5a4c5e40..53b17ca0dff5 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -93,7 +93,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 7633a752c65e..0ad080790a32 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -99,7 +99,7 @@ static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr, _debug("tktlen: %x", tktlen); if (tktlen > AFSTOKEN_RK_TIX_MAX) return -EKEYREJECTED; - if (8 * 4 + tktlen != toklen) + if (toklen < 8 * 4 + tktlen) return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8a5ba5add4bc..648778aef1a2 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -948,7 +948,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 29a30a14c315..bdbdb1a7920a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -134,7 +134,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) int err; int tp_created = 0; - if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETTFILTER) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; replay: diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index eed8404443d8..f435a88d899a 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -188,6 +188,12 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; +static void tcindex_filter_result_init(struct tcindex_filter_result *r) +{ + memset(r, 0, sizeof(*r)); + tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, @@ -207,15 +213,11 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, return err; memcpy(&cp, p, sizeof(cp)); - memset(&new_filter_result, 0, sizeof(new_filter_result)); - tcf_exts_init(&new_filter_result.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + tcindex_filter_result_init(&new_filter_result); + tcindex_filter_result_init(&cr); if (old_r) - memcpy(&cr, r, sizeof(cr)); - else { - memset(&cr, 0, sizeof(cr)); - tcf_exts_init(&cr.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - } + cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -267,9 +269,14 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp.perfect && !cp.h) { if (valid_perfect_hash(&cp)) { + int i; + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); if (!cp.perfect) goto errout; + for (i = 0; i < cp.hash; i++) + tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); balloc = 1; } else { cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); @@ -295,14 +302,17 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr.res, base); } - tcf_exts_change(tp, &cr.exts, &e); + if (old_r) + tcf_exts_change(tp, &r->exts, &e); + else + tcf_exts_change(tp, &cr.exts, &e); tcf_tree_lock(tp); if (old_r && old_r != r) - memset(old_r, 0, sizeof(*old_r)); + tcindex_filter_result_init(old_r); memcpy(p, &cp, sizeof(cp)); - memcpy(r, &cr, sizeof(cr)); + r->res = cr.res; if (r == &new_filter_result) { struct tcindex_filter **fp; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a0b84e0e22de..400769014bbd 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1084,7 +1084,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) struct Qdisc *p = NULL; int err; - if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETQDISC) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1151,7 +1151,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) struct Qdisc *q, *p; int err; - if (!capable(CAP_NET_ADMIN)) + if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; replay: @@ -1490,7 +1490,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) u32 qid; int err; - if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETTCLASS) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index edee03d922e2..6e957c3b9854 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -553,11 +553,6 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - sch_tree_lock(sch); - - if (tb[TCA_HHF_BACKLOG_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); - if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); @@ -567,6 +562,12 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum > INT_MAX) return -EINVAL; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + q->quantum = new_quantum; q->hhf_non_hh_weight = new_hhf_non_hh_weight; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4f6d6f9d1274..39579c3e0d14 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) return false; } -/* Update asoc's rwnd for the approximated state in the buffer, - * and check whether SACK needs to be sent. - */ -void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) +/* Increase asoc's rwnd by len and send any window update SACK if needed. */ +void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) { - int rx_count; struct sctp_chunk *sack; struct timer_list *timer; - if (asoc->ep->rcvbuf_policy) - rx_count = atomic_read(&asoc->rmem_alloc); - else - rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + if (asoc->rwnd_over) { + if (asoc->rwnd_over >= len) { + asoc->rwnd_over -= len; + } else { + asoc->rwnd += (len - asoc->rwnd_over); + asoc->rwnd_over = 0; + } + } else { + asoc->rwnd += len; + } - if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) - asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; - else - asoc->rwnd = 0; + /* If we had window pressure, start recovering it + * once our rwnd had reached the accumulated pressure + * threshold. The idea is to recover slowly, but up + * to the initial advertised window. + */ + if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { + int change = min(asoc->pathmtu, asoc->rwnd_press); + asoc->rwnd += change; + asoc->rwnd_press -= change; + } - pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", - __func__, asoc, asoc->rwnd, rx_count, - asoc->base.sk->sk_rcvbuf); + pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", + __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, + asoc->a_rwnd); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. * The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ - if (update_peer && sctp_peer_needs_update(asoc)) { + if (sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " @@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) } } +/* Decrease asoc's rwnd by len. */ +void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) +{ + int rx_count; + int over = 0; + + if (unlikely(!asoc->rwnd || asoc->rwnd_over)) + pr_debug("%s: association:%p has asoc->rwnd:%u, " + "asoc->rwnd_over:%u!\n", __func__, asoc, + asoc->rwnd, asoc->rwnd_over); + + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + + /* If we've reached or overflowed our receive buffer, announce + * a 0 rwnd if rwnd would still be positive. Store the + * the potential pressure overflow so that the window can be restored + * back to original value. + */ + if (rx_count >= asoc->base.sk->sk_rcvbuf) + over = 1; + + if (asoc->rwnd >= len) { + asoc->rwnd -= len; + if (over) { + asoc->rwnd_press += asoc->rwnd; + asoc->rwnd = 0; + } + } else { + asoc->rwnd_over = len - asoc->rwnd; + asoc->rwnd = 0; + } + + pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", + __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, + asoc->rwnd_press); +} /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 683c7d1b1306..0e8529113dc5 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -386,14 +386,13 @@ nomem: */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ - if (!net->sctp.auth_enable || !asoc->peer.auth_capable) + if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an @@ -440,16 +439,16 @@ struct sctp_shared_key *sctp_auth_get_shkey( */ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) { - struct net *net = sock_net(ep->base.sk); struct crypto_hash *tfm = NULL; __u16 id; - /* if the transforms are already allocted, we are done */ - if (!net->sctp.auth_enable) { + /* If AUTH extension is disabled, we are done */ + if (!ep->auth_enable) { ep->auth_hmacs = NULL; return 0; } + /* If the transforms are already allocated, we are done */ if (ep->auth_hmacs) return 0; @@ -665,12 +664,10 @@ static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param) /* Check if peer requested that this chunk is authenticated */ int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc) { - struct net *net; if (!asoc) return 0; - net = sock_net(asoc->base.sk); - if (!net->sctp.auth_enable || !asoc->peer.auth_capable) + if (!asoc->ep->auth_enable || !asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, asoc->peer.peer_chunks); @@ -679,12 +676,10 @@ int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc) /* Check if we requested that peer authenticate this chunk. */ int sctp_auth_recv_cid(sctp_cid_t chunk, const struct sctp_association *asoc) { - struct net *net; if (!asoc) return 0; - net = sock_net(asoc->base.sk); - if (!net->sctp.auth_enable) + if (!asoc->ep->auth_enable) return 0; return __sctp_auth_cid(chunk, diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 8e5fdea05216..3d9f429858dc 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -68,7 +68,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, if (!ep->digest) return NULL; - if (net->sctp.auth_enable) { + ep->auth_enable = net->sctp.auth_enable; + if (ep->auth_enable) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4e1d0fcb028e..44cbb54c8574 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -491,8 +491,13 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, continue; if ((laddr->state == SCTP_ADDR_SRC) && (AF_INET == laddr->a.sa.sa_family)) { - fl4->saddr = laddr->a.v4.sin_addr.s_addr; fl4->fl4_sport = laddr->a.v4.sin_port; + flowi4_update_output(fl4, + asoc->base.sk->sk_bound_dev_if, + RT_CONN_FLAGS(asoc->base.sk), + daddr->v4.sin_addr.s_addr, + laddr->a.v4.sin_addr.s_addr); + rt = ip_route_output_key(sock_net(sk), fl4); if (!IS_ERR(rt)) { dst = &rt->dst; @@ -957,7 +962,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(skb, &transport->fl); + return ip_queue_xmit(&inet->sk, skb, &transport->fl); } static struct sctp_af sctp_af_inet; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3a1767ef3201..fee5552ddf92 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -219,6 +219,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, gfp_t gfp, int vparam_len) { struct net *net = sock_net(asoc->base.sk); + struct sctp_endpoint *ep = asoc->ep; sctp_inithdr_t init; union sctp_params addrs; size_t chunksize; @@ -278,7 +279,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += vparam_len; /* Account for AUTH related parameters */ - if (net->sctp.auth_enable) { + if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); @@ -363,7 +364,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, } /* Add SCTP-AUTH chunks to the parameter list */ - if (net->sctp.auth_enable) { + if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) @@ -2010,7 +2011,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc, /* if the peer reports AUTH, assume that he * supports AUTH. */ - if (net->sctp.auth_enable) + if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: @@ -2102,6 +2103,7 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, * SCTP_IERROR_NO_ERROR - continue with the chunk */ static sctp_ierror_t sctp_verify_param(struct net *net, + const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, sctp_cid_t cid, @@ -2152,7 +2154,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, goto fallthrough; case SCTP_PARAM_RANDOM: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; /* SCTP-AUTH: Secion 6.1 @@ -2169,7 +2171,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_CHUNKS: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; /* SCTP-AUTH: Section 3.2 @@ -2185,7 +2187,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_HMAC_ALGO: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fallthrough; hmacs = (struct sctp_hmac_algo_param *)param.p; @@ -2220,10 +2222,9 @@ fallthrough: } /* Verify the INIT packet before we process it. */ -int sctp_verify_init(struct net *net, const struct sctp_association *asoc, - sctp_cid_t cid, - sctp_init_chunk_t *peer_init, - struct sctp_chunk *chunk, +int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, sctp_cid_t cid, + sctp_init_chunk_t *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; @@ -2264,8 +2265,8 @@ int sctp_verify_init(struct net *net, const struct sctp_association *asoc, /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init, init_hdr.params) { - - result = sctp_verify_param(net, asoc, param, cid, chunk, errp); + result = sctp_verify_param(net, ep, asoc, param, cid, + chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: @@ -2497,6 +2498,7 @@ static int sctp_process_param(struct sctp_association *asoc, struct sctp_af *af; union sctp_addr_param *addr_param; struct sctp_transport *t; + struct sctp_endpoint *ep = asoc->ep; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters @@ -2636,7 +2638,7 @@ do_addr_param: goto fall_through; case SCTP_PARAM_RANDOM: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ @@ -2649,7 +2651,7 @@ do_addr_param: break; case SCTP_PARAM_HMAC_ALGO: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ @@ -2665,7 +2667,7 @@ do_addr_param: break; case SCTP_PARAM_CHUNKS: - if (!net->sctp.auth_enable) + if (!ep->auth_enable) goto fall_through; asoc->peer.peer_chunks = kmemdup(param.p, diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 5d6883ff00c3..fef2acdf4a2e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -496,11 +496,10 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, /* If the transport error count is greater than the pf_retrans * threshold, and less than pathmaxrtx, and if the current state - * is not SCTP_UNCONFIRMED, then mark this transport as Partially - * Failed, see SCTP Quick Failover Draft, section 5.1 + * is SCTP_ACTIVE, then mark this transport as Partially Failed, + * see SCTP Quick Failover Draft, section 5.1 */ - if ((transport->state != SCTP_PF) && - (transport->state != SCTP_UNCONFIRMED) && + if ((transport->state == SCTP_ACTIVE) && (asoc->pf_retrans < transport->pathmaxrxt) && (transport->error_count > asoc->pf_retrans)) { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 01e002430c85..5170a1ff95a1 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -357,7 +357,7 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { /* This chunk contains fatal error. It is to be discarded. @@ -524,7 +524,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { @@ -1430,7 +1430,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( /* Verify the INIT chunk before processing it. */ err_chunk = NULL; - if (!sctp_verify_init(net, asoc, chunk->chunk_hdr->type, + if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { /* This chunk contains fatal error. It is to be discarded. @@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * PMTU. In cases, such as loopback, this might be a rather * large spill over. */ - if ((!chunk->data_accepted) && (!asoc->rwnd || + if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || (datalen > asoc->rwnd + asoc->frag_point))) { /* If this is the next TSN, consider reneging to make diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e13519e9df80..fee06b99a4da 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); + /* When only partial message is copied to the user, increase + * rwnd by that amount. If all the data in the skb is read, + * rwnd is updated when the event is freed. + */ + if (!sctp_ulpevent_is_notification(event)) + sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) @@ -3315,10 +3321,10 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunk val; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) @@ -3335,7 +3341,7 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, } /* add this chunk id to the endpoint */ - return sctp_auth_ep_add_chunkid(sctp_sk(sk)->ep, val.sauth_chunk); + return sctp_auth_ep_add_chunkid(ep, val.sauth_chunk); } /* @@ -3348,12 +3354,12 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo *hmacs; u32 idents; int err; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen < sizeof(struct sctp_hmacalgo)) @@ -3370,7 +3376,7 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, goto out; } - err = sctp_auth_ep_set_hmacs(sctp_sk(sk)->ep, hmacs); + err = sctp_auth_ep_set_hmacs(ep, hmacs); out: kfree(hmacs); return err; @@ -3386,12 +3392,12 @@ static int sctp_setsockopt_auth_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkey *authkey; struct sctp_association *asoc; int ret; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen <= sizeof(struct sctp_authkey)) @@ -3412,7 +3418,7 @@ static int sctp_setsockopt_auth_key(struct sock *sk, goto out; } - ret = sctp_auth_set_key(sctp_sk(sk)->ep, asoc, authkey); + ret = sctp_auth_set_key(ep, asoc, authkey); out: kzfree(authkey); return ret; @@ -3428,11 +3434,11 @@ static int sctp_setsockopt_active_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authkeyid)) @@ -3444,8 +3450,7 @@ static int sctp_setsockopt_active_key(struct sock *sk, if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - return sctp_auth_set_active_key(sctp_sk(sk)->ep, asoc, - val.scact_keynumber); + return sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); } /* @@ -3457,11 +3462,11 @@ static int sctp_setsockopt_del_key(struct sock *sk, char __user *optval, unsigned int optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authkeyid)) @@ -3473,8 +3478,7 @@ static int sctp_setsockopt_del_key(struct sock *sk, if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - return sctp_auth_del_key_id(sctp_sk(sk)->ep, asoc, - val.scact_keynumber); + return sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); } @@ -5381,16 +5385,16 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo __user *p = (void __user *)optval; struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; - hmacs = sctp_sk(sk)->ep->auth_hmacs_list; + hmacs = ep->auth_hmacs_list; data_len = ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t); if (len < sizeof(struct sctp_hmacalgo) + data_len) @@ -5411,11 +5415,11 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, static int sctp_getsockopt_active_key(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authkeyid)) @@ -5430,7 +5434,7 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (asoc) val.scact_keynumber = asoc->active_key_id; else - val.scact_keynumber = sctp_sk(sk)->ep->active_key_id; + val.scact_keynumber = ep->active_key_id; len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) @@ -5444,7 +5448,7 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; @@ -5452,7 +5456,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authchunks)) @@ -5489,7 +5493,7 @@ num: static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct net *net = sock_net(sk); + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; @@ -5497,7 +5501,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, u32 num_chunks = 0; char __user *to; - if (!net->sctp.auth_enable) + if (!ep->auth_enable) return -EACCES; if (len < sizeof(struct sctp_authchunks)) @@ -5514,7 +5518,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, if (asoc) ch = (struct sctp_chunks_param *)asoc->c.auth_chunks; else - ch = sctp_sk(sk)->ep->auth_chunk_list; + ch = ep->auth_chunk_list; if (!ch) goto num; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 35c8923b5554..c82fdc1eab7c 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -64,6 +64,9 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_auth(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -266,7 +269,7 @@ static struct ctl_table sctp_net_table[] = { .data = &init_net.sctp.auth_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_sctp_do_auth, }, { .procname = "addr_scope_policy", @@ -400,6 +403,37 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_auth(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + struct ctl_table tbl; + int new_value, ret; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.auth_enable; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + + if (write) { + struct sock *sk = net->sctp.ctl_sock; + + net->sctp.auth_enable = new_value; + /* Update the value in the control socket */ + lock_sock(sk); + sctp_sk(sk)->ep->auth_enable = new_value; + release_sock(sk); + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table = sctp_net_table; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8d198ae03606..85c64658bd0b 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); - sctp_assoc_rwnd_update(asoc, false); + sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); if (!skb->data_len) return; @@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; - struct sctp_association *asoc; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as @@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) } done: - asoc = event->asoc; - sctp_association_hold(asoc); + sctp_assoc_rwnd_increase(event->asoc, len); sctp_ulpevent_release_owner(event); - sctp_assoc_rwnd_update(asoc, true); - sctp_association_put(asoc); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) diff --git a/net/socket.c b/net/socket.c index 1b1e7e6a960f..abf56b2a14f9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1880,8 +1880,8 @@ out: * Receive a datagram from a socket. */ -asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, - unsigned int flags) +SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, + unsigned int, flags) { return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5285ead196c0..247e973544bf 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -296,7 +296,7 @@ static void rpcauth_unhash_cred_locked(struct rpc_cred *cred) { hlist_del_rcu(&cred->cr_hash); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 36e431ee1c90..b6e440baccc3 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -143,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 0f73f4507746..4ce5eccec1f6 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1503,6 +1503,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_integ_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE; break; case RPC_GSS_SVC_PRIVACY: /* placeholders for length and seq. number: */ @@ -1511,6 +1512,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_priv_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2; break; default: goto auth_err; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3513d559bc45..9761a0da964d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -244,10 +244,10 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); req->rq_connect_cookie = xprt->connect_cookie - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (!xprt_need_to_requeue(xprt)) { /* diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index ae333c1845bb..066362141133 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -374,7 +374,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) } return; out: - printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); + printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 14c9f6d1c5ff..f2b7cb540e61 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task) (task->tk_msg.rpc_proc->p_decode != NULL); } +static inline int sock_is_loopback(struct sock *sk) +{ + struct dst_entry *dst; + int loopback = 0; + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); + if (dst && dst->dev && + (dst->dev->features & NETIF_F_LOOPBACK)) + loopback = 1; + rcu_read_unlock(); + return loopback; +} + int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page *headpage, unsigned long headoffset, struct page *tailpage, unsigned long tailoffset); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 06c6ff0cb911..b4737fbdec13 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -597,6 +597,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } rqstp->rq_pages[i] = p; } + rqstp->rq_page_end = &rqstp->rq_pages[i]; rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ /* Make arg->head point to first page and arg->pages point to rest */ @@ -730,6 +731,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) newxpt = xprt->xpt_ops->xpo_accept(xprt); if (newxpt) svc_add_new_temp_xprt(serv, newxpt); + else + module_put(xprt->xpt_class->xcl_owner); } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", @@ -793,7 +796,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp); rqstp->rq_chandle.defer = svc_defer; if (serv->sv_stats) diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 2af7b0cba43a..79c0f3459b5c 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) } spin_unlock(&authtab_lock); + rqstp->rq_auth_slack = 0; + rqstp->rq_authop = aops; return aops->accept(rqstp, authp); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 43bcb4699d69..b507cd327d9b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -400,6 +400,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, release_sock(sock->sk); #endif } + +static int svc_sock_secure_port(struct svc_rqst *rqstp) +{ + return svc_port_is_privileged(svc_addr(rqstp)); +} + /* * INET callback when data has been received on the socket. */ @@ -678,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_udp_class = { @@ -842,8 +849,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) * tell us anything. For now just warn about unpriv connections. */ if (!svc_port_is_privileged(sin)) { - dprintk(KERN_WARNING - "%s: connect from unprivileged port: %s\n", + dprintk("%s: connect from unprivileged port: %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); } @@ -867,6 +873,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); + if (sock_is_loopback(newsock->sk)) + set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); + else + clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1112,6 +1122,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; + rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; @@ -1234,6 +1245,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_bc_class = { @@ -1272,6 +1284,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index dd97ba3c4456..23fb4e75e245 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -462,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; + xdr_set_scratch_buffer(xdr, NULL, 0); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -482,6 +483,73 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_encode); /** + * xdr_commit_encode - Ensure all data is written to buffer + * @xdr: pointer to xdr_stream + * + * We handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place; xdr_commit_encode does that copying. + * + * Normally the caller doesn't need to call this directly, as the + * following xdr_reserve_space will do it. But an explicit call may be + * required at the end of encoding, or any other time when the xdr_buf + * data might be read. + */ +void xdr_commit_encode(struct xdr_stream *xdr) +{ + int shift = xdr->scratch.iov_len; + void *page; + + if (shift == 0) + return; + page = page_address(*xdr->page_ptr); + memcpy(xdr->scratch.iov_base, page, shift); + memmove(page, page + shift, (void *)xdr->p - page); + xdr->scratch.iov_len = 0; +} +EXPORT_SYMBOL_GPL(xdr_commit_encode); + +__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +{ + static __be32 *p; + int space_left; + int frag1bytes, frag2bytes; + + if (nbytes > PAGE_SIZE) + return NULL; /* Bigger buffers require special handling */ + if (xdr->buf->len + nbytes > xdr->buf->buflen) + return NULL; /* Sorry, we're totally out of space */ + frag1bytes = (xdr->end - xdr->p) << 2; + frag2bytes = nbytes - frag1bytes; + if (xdr->iov) + xdr->iov->iov_len += frag1bytes; + else + xdr->buf->page_len += frag1bytes; + xdr->page_ptr++; + xdr->iov = NULL; + /* + * If the last encode didn't end exactly on a page boundary, the + * next one will straddle boundaries. Encode into the next + * page, then copy it back later in xdr_commit_encode. We use + * the "scratch" iov to track any temporarily unused fragment of + * space at the end of the previous buffer: + */ + xdr->scratch.iov_base = xdr->p; + xdr->scratch.iov_len = frag1bytes; + p = page_address(*xdr->page_ptr); + /* + * Note this is where the next encode will start after we've + * shifted this one back: + */ + xdr->p = (void *)p + frag2bytes; + space_left = xdr->buf->buflen - xdr->buf->len; + xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + xdr->buf->page_len += frag2bytes; + xdr->buf->len += nbytes; + return p; +} + +/** * xdr_reserve_space - Reserve buffer space for sending * @xdr: pointer to xdr_stream * @nbytes: number of bytes to reserve @@ -495,20 +563,122 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) __be32 *p = xdr->p; __be32 *q; + xdr_commit_encode(xdr); /* align nbytes on the next 32-bit boundary */ nbytes += 3; nbytes &= ~3; q = p + (nbytes >> 2); if (unlikely(q > xdr->end || q < p)) - return NULL; + return xdr_get_next_encode_buffer(xdr, nbytes); xdr->p = q; - xdr->iov->iov_len += nbytes; + if (xdr->iov) + xdr->iov->iov_len += nbytes; + else + xdr->buf->page_len += nbytes; xdr->buf->len += nbytes; return p; } EXPORT_SYMBOL_GPL(xdr_reserve_space); /** + * xdr_truncate_encode - truncate an encode buffer + * @xdr: pointer to xdr_stream + * @len: new length of buffer + * + * Truncates the xdr stream, so that xdr->buf->len == len, + * and xdr->p points at offset len from the start of the buffer, and + * head, tail, and page lengths are adjusted to correspond. + * + * If this means moving xdr->p to a different buffer, we assume that + * that the end pointer should be set to the end of the current page, + * except in the case of the head buffer when we assume the head + * buffer's current length represents the end of the available buffer. + * + * This is *not* safe to use on a buffer that already has inlined page + * cache pages (as in a zero-copy server read reply), except for the + * simple case of truncating from one position in the tail to another. + * + */ +void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + int fraglen; + int new, old; + + if (len > buf->len) { + WARN_ON_ONCE(1); + return; + } + xdr_commit_encode(xdr); + + fraglen = min_t(int, buf->len - len, tail->iov_len); + tail->iov_len -= fraglen; + buf->len -= fraglen; + if (tail->iov_len && buf->len == len) { + xdr->p = tail->iov_base + tail->iov_len; + /* xdr->end, xdr->iov should be set already */ + return; + } + WARN_ON_ONCE(fraglen); + fraglen = min_t(int, buf->len - len, buf->page_len); + buf->page_len -= fraglen; + buf->len -= fraglen; + + new = buf->page_base + buf->page_len; + old = new + fraglen; + xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT); + + if (buf->page_len && buf->len == len) { + xdr->p = page_address(*xdr->page_ptr); + xdr->end = (void *)xdr->p + PAGE_SIZE; + xdr->p = (void *)xdr->p + (new % PAGE_SIZE); + /* xdr->iov should already be NULL */ + return; + } + if (fraglen) { + xdr->end = head->iov_base + head->iov_len; + xdr->page_ptr--; + } + /* (otherwise assume xdr->end is already set) */ + head->iov_len = len; + buf->len = len; + xdr->p = head->iov_base + head->iov_len; + xdr->iov = buf->head; +} +EXPORT_SYMBOL(xdr_truncate_encode); + +/** + * xdr_restrict_buflen - decrease available buffer space + * @xdr: pointer to xdr_stream + * @newbuflen: new maximum number of bytes available + * + * Adjust our idea of how much space is available in the buffer. + * If we've already used too much space in the buffer, returns -1. + * If the available space is already smaller than newbuflen, returns 0 + * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen + * and ensures xdr->end is set at most offset newbuflen from the start + * of the buffer. + */ +int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen) +{ + struct xdr_buf *buf = xdr->buf; + int left_in_this_buf = (void *)xdr->end - (void *)xdr->p; + int end_offset = buf->len + left_in_this_buf; + + if (newbuflen < 0 || newbuflen < buf->len) + return -1; + if (newbuflen > buf->buflen) + return 0; + if (newbuflen < end_offset) + xdr->end = (void *)xdr->end + newbuflen - end_offset; + buf->buflen = newbuflen; + return 0; +} +EXPORT_SYMBOL(xdr_restrict_buflen); + +/** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream * @pages: list of pages diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2d1d5a643b95..c3b2b3369e52 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -212,9 +212,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } else queue_work(rpciod_workqueue, &xprt->task_cleanup); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 8d904e4eef15..8f92a61ee2df 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, byte_count, ctxt->sge[0].length); rqstp->rq_arg.len = byte_count; rqstp->rq_arg.buflen = byte_count; @@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - bc -= min(bc, ctxt->sge[sge_no].length); + bc -= min_t(u32, bc, ctxt->sge[sge_no].length); rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } @@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Encode a read-chunk-list as an array of IB SGE - * - * Assumptions: - * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - * - */ -static int map_read_chunks(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { - int sge_no; - int sge_bytes; - int page_off; - int page_no; - int ch_bytes; - int ch_no; - struct rpcrdma_read_chunk *ch; + if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + RDMA_TRANSPORT_IWARP) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} - sge_no = 0; - page_no = 0; - page_off = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch_no = 0; - ch_bytes = ntohl(ch->rc_target.rs_length); - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = ch_bytes; - head->arg.len = rqstp->rq_arg.len + ch_bytes; - head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; - head->count++; - chl_map->ch[0].start = 0; - while (byte_count) { - rpl_map->sge[sge_no].iov_base = - page_address(rqstp->rq_arg.pages[page_no]) + page_off; - sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - rpl_map->sge[sge_no].iov_len = sge_bytes; - /* - * Don't bump head->count here because the same page - * may be used by multiple SGE. - */ - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; +typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last); + +/* Issue an RDMA_READ using the local lkey to map the data sink */ +static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; + + ctxt->direction = DMA_FROM_DEVICE; + ctxt->read_hdr = head; + pages_needed = + min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; rqstp->rq_next_page = rqstp->rq_respages + 1; + ctxt->sge[pno].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, + PAGE_SIZE - pg_off, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[pno].addr); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); - byte_count -= sge_bytes; - ch_bytes -= sge_bytes; - sge_no++; - /* - * If all bytes for this chunk have been mapped to an - * SGE, move to the next SGE - */ - if (ch_bytes == 0) { - chl_map->ch[ch_no].count = - sge_no - chl_map->ch[ch_no].start; - ch_no++; - ch++; - chl_map->ch[ch_no].start = sge_no; - ch_bytes = ntohl(ch->rc_target.rs_length); - /* If bytes remaining account for next chunk */ - if (byte_count) { - head->arg.page_len += ch_bytes; - head->arg.len += ch_bytes; - head->arg.buflen += ch_bytes; - } + /* The lkey here is either a local dma lkey or a dma_mr lkey */ + ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].length = len; + ctxt->count++; + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; } - /* - * If this SGE consumed all of the page, move to the - * next page - */ - if ((sge_bytes + page_off) == PAGE_SIZE) { - page_no++; - page_off = 0; - /* - * If there are still bytes left to map, bump - * the page count - */ - if (byte_count) - head->count++; - } else - page_off += sge_bytes; + rs_length -= len; } - BUG_ON(byte_count != 0); - return sge_no; + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + return ret; } -/* Map a read-chunk-list to an XDR and fast register the page-list. - * - * Assumptions: - * - chunk[0] position points to pages[0] at an offset of 0 - * - pages[] will be made physically contiguous by creating a one-off memory - * region using the fastreg verb. - * - byte_count is # of bytes in read-chunk-list - * - ch_count is # of chunks in read-chunk-list - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - */ -static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, +/* Issue an RDMA_READ using an FRMR to map the data sink */ +static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) { - int page_no; - int ch_no; - u32 offset; - struct rpcrdma_read_chunk *ch; - struct svc_rdma_fastreg_mr *frmr; - int ret = 0; + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + struct ib_send_wr fastreg_wr; + u8 key; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; - frmr = svc_rdma_get_frmr(xprt); if (IS_ERR(frmr)) return -ENOMEM; - head->frmr = frmr; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = byte_count; - head->arg.len = rqstp->rq_arg.len + byte_count; - head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = frmr; + pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); - /* Fast register the page list */ - frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->map_len = byte_count; - frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; - for (page_no = 0; page_no < frmr->page_list_len; page_no++) { - frmr->page_list->page_list[page_no] = + frmr->map_len = pages_needed << PAGE_SHIFT; + frmr->page_list_len = pages_needed; + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + frmr->page_list->page_list[pno] = ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], 0, + head->arg.pages[pg_no], 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[pno]); + if (ret) + goto err; atomic_inc(&xprt->sc_dma_used); - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - } - head->count += page_no; - - /* rq_respages points one past arg pages */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Create the reply and chunk maps */ - offset = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - for (ch_no = 0; ch_no < ch_count; ch_no++) { - int len = ntohl(ch->rc_target.rs_length); - rpl_map->sge[ch_no].iov_base = frmr->kva + offset; - rpl_map->sge[ch_no].iov_len = len; - chl_map->ch[ch_no].count = 1; - chl_map->ch[ch_no].start = ch_no; - offset += len; - ch++; + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; } - ret = svc_rdma_fastreg(xprt, frmr); - if (ret) - goto fatal_err; - - return ch_no; - - fatal_err: - printk("svcrdma: error fast registering xdr for xprt %p", xprt); - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - -static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct svc_rdma_fastreg_mr *frmr, - struct kvec *vec, - u64 *sgl_offset, - int count) -{ - int i; - unsigned long off; + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->count = count; - ctxt->direction = DMA_FROM_DEVICE; - for (i = 0; i < count; i++) { - ctxt->sge[i].length = 0; /* in case map fails */ - if (!frmr) { - BUG_ON(!virt_to_page(vec[i].iov_base)); - off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; - ctxt->sge[i].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(vec[i].iov_base), - off, - vec[i].iov_len, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[i].addr)) - return -EINVAL; - ctxt->sge[i].lkey = xprt->sc_dma_lkey; - atomic_inc(&xprt->sc_dma_used); - } else { - ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; - ctxt->sge[i].lkey = frmr->mr->lkey; - } - ctxt->sge[i].length = vec[i].iov_len; - *sgl_offset = *sgl_offset + vec[i].iov_len; + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].lkey = frmr->mr->lkey; + ctxt->sge[0].length = read; + ctxt->count = 1; + ctxt->read_hdr = head; + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + fastreg_wr.next = &read_wr; + + /* Prepare RDMA_READ */ + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = 1; + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + } else { + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.next = &inv_wr; + /* Prepare invalidate */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; + + /* Post the chain */ + ret = svc_rdma_send(xprt, &fastreg_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; } - return 0; -} -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) && - sge_count > 1) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + svc_rdma_put_frmr(xprt, frmr); + return ret; } -/* - * Use RDMA_READ to read data from the advertised client buffer into the - * XDR stream starting at rq_arg.head[0].iov_base. - * Each chunk in the array - * contains the following fields: - * discrim - '1', This isn't used for data placement - * position - The xdr stream offset (the same for every chunk) - * handle - RMR for client memory region - * length - data transfer length - * offset - 64 bit tagged offset in remote memory region - * - * On our side, we need to read into a pagelist. The first page immediately - * follows the RPC header. - * - * This function returns: - * 0 - No error and no read-list found. - * - * 1 - Successful read-list processing. The data is not yet in - * the pagelist and therefore the RPC request must be deferred. The - * I/O completion will enqueue the transport again and - * svc_rdma_recvfrom will complete the request. - * - * <0 - Error processing/posting read-list. - * - * NOTE: The ctxt must not be touched after the last WR has been posted - * because the I/O completion processing may occur on another - * processor and free / modify the context. Ne touche pas! - */ -static int rdma_read_xdr(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *hdr_ctxt) +static int rdma_read_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) { - struct ib_send_wr read_wr; - struct ib_send_wr inv_wr; - int err = 0; - int ch_no; - int ch_count; - int byte_count; - int sge_count; - u64 sgl_offset; + int page_no, ch_count, ret; struct rpcrdma_read_chunk *ch; - struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_req_map *rpl_map; - struct svc_rdma_req_map *chl_map; + u32 page_offset, byte_count; + u64 rs_offset; + rdma_reader_fn reader; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - /* Allocate temporary reply and chunk maps */ - rpl_map = svc_rdma_get_req_map(); - chl_map = svc_rdma_get_req_map(); + /* The request is completed when the RDMA_READs complete. The + * head context keeps all the pages that comprise the + * request. + */ + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; - if (!xprt->sc_frmr_pg_list_len) - sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); + /* Use FRMR if supported */ + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) + reader = rdma_read_chunk_frmr; else - sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); - if (sge_count < 0) { - err = -EIO; - goto out; - } - - sgl_offset = 0; - ch_no = 0; + reader = rdma_read_chunk_lcl; + page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch->rc_discrim != 0; ch++, ch_no++) { - u64 rs_offset; -next_sge: - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = hdr_ctxt->frmr; - ctxt->read_hdr = NULL; - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + ch->rc_discrim != 0; ch++) { - /* Prepare READ WR */ - memset(&read_wr, 0, sizeof read_wr); - read_wr.wr_id = (unsigned long)ctxt; - read_wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.opcode; - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = - rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); - if (err) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; - } - if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == chl_map->ch[ch_no].count)) { - /* - * Mark the last RDMA_READ with a bit to - * indicate all RPC data has been fetched from - * the client and the RPC needs to be enqueued. - */ - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - if (hdr_ctxt->frmr) { - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - /* - * Invalidate the local MR used to map the data - * sink. - */ - if (xprt->sc_dev_caps & - SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.opcode = - IB_WR_RDMA_READ_WITH_INV; - ctxt->wr_op = read_wr.opcode; - read_wr.ex.invalidate_rkey = - ctxt->frmr->mr->lkey; - } else { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - hdr_ctxt->frmr->mr->lkey; - read_wr.next = &inv_wr; - } - } - ctxt->read_hdr = hdr_ctxt; - } - /* Post the read */ - err = svc_rdma_send(xprt, &read_wr); - if (err) { - printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", - err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; + byte_count = ntohl(ch->rc_target.rs_length); + + while (byte_count > 0) { + ret = reader(xprt, rqstp, head, + &page_no, &page_offset, + ntohl(ch->rc_target.rs_handle), + byte_count, rs_offset, + ((ch+1)->rc_discrim == 0) /* last */ + ); + if (ret < 0) + goto err; + byte_count -= ret; + rs_offset += ret; + head->arg.buflen += ret; } - atomic_inc(&rdma_stat_read); - - if (read_wr.num_sge < chl_map->ch[ch_no].count) { - chl_map->ch[ch_no].count -= read_wr.num_sge; - chl_map->ch[ch_no].start += read_wr.num_sge; - goto next_sge; - } - sgl_offset = 0; - err = 1; } - - out: - svc_rdma_put_req_map(rpl_map); - svc_rdma_put_req_map(chl_map); - + ret = 1; + err: /* Detach arg pages. svc_recv will replenish them */ - for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) - rqstp->rq_pages[ch_no] = NULL; + for (page_no = 0; + &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) + rqstp->rq_pages[page_no] = NULL; - return err; + return ret; } static int rdma_read_complete(struct svc_rqst *rqstp, @@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, dto_q); list_del_init(&ctxt->dto_q); - } - if (ctxt) { spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); return rdma_read_complete(rqstp, ctxt); - } - - if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, dto_q); @@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto close_out; - BUG_ON(ret); goto out; } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", @@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } /* Read read-list data. */ - ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { /* read-list posted, defer until data received from client. */ goto defer; - } - if (ret < 0) { + } else if (ret < 0) { /* Post of read-list failed, free context. */ svc_rdma_put_context(ctxt, 1); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7e024a51617e..49fd21a5c215 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,152 +50,6 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* Encode an XDR as an array of IB SGE - * - * Assumptions: - * - head[0] is physically contiguous. - * - tail[0] is physically contiguous. - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * SGE[0] reserved for RCPRDMA header - * SGE[1] data from xdr->head[] - * SGE[2..sge_count-2] data from xdr->pages[] - * SGE[sge_count-1] data from xdr->tail. - * - * The max SGE we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES - * reserves a page for both the request and the reply header, and this - * array is only concerned with the reply we are assured that we have - * on extra page for the RPCRMDA header. - */ -static int fast_reg_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no = 0; - u8 *frva; - struct svc_rdma_fastreg_mr *frmr; - - frmr = svc_rdma_get_frmr(xprt); - if (IS_ERR(frmr)) - return -ENOMEM; - vec->frmr = frmr; - - /* Skip the RPCRDMA header */ - sge_no = 1; - - /* Map the head. */ - frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - vec->count = 2; - sge_no++; - - /* Map the XDR head */ - frmr->kva = frva; - frmr->direction = DMA_TO_DEVICE; - frmr->access_flags = 0; - frmr->map_len = PAGE_SIZE; - frmr->page_list_len = 1; - page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(xdr->head[0].iov_base), - page_off, - PAGE_SIZE - page_off, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - - /* Map the XDR page list */ - page_off = xdr->page_base; - page_bytes = xdr->page_len + page_off; - if (!page_bytes) - goto encode_tail; - - /* Map the pages */ - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - vec->sge[sge_no].iov_len = page_bytes; - sge_no++; - while (page_bytes) { - struct page *page; - - page = xdr->pages[page_no++]; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - page, page_off, - sge_bytes, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - - atomic_inc(&xprt->sc_dma_used); - page_off = 0; /* reset for next time through loop */ - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - vec->count++; - - encode_tail: - /* Map tail */ - if (0 == xdr->tail[0].iov_len) - goto done; - - vec->count++; - vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; - - if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == - ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { - /* - * If head and tail use the same page, we don't need - * to map it again. - */ - vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; - } else { - void *va; - - /* Map another page for the tail */ - page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), - page_off, - PAGE_SIZE, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - - done: - if (svc_rdma_fastreg(xprt, frmr)) - goto fatal_err; - - return 0; - - fatal_err: - printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); - vec->frmr = NULL; - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) @@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); - if (xprt->sc_frmr_pg_list_len) - return fast_reg_xdr(xprt, xdr, vec); - /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, } /* Assumptions: - * - We are using FRMR - * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_bytes = min_t(size_t, bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - if (!vec->frmr) { - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; - } else { - sge[sge_no].addr = (unsigned long) - vec->sge[xdr_sge_no].iov_base + sge_off; - sge[sge_no].lkey = vec->frmr->mr->lkey; - } + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; ctxt->count++; - ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; xdr_sge_no++; @@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, return 0; err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, vec->frmr); svc_rdma_put_context(ctxt, 0); /* Fatal error, close transport */ return -EIO; @@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ nchunks = ntohl(arg_ary->wc_nchunks); @@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; - struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma, "svcrdma: could not post a receive buffer, err=%d." "Closing transport %p.\n", ret, rdma); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 0); return -ENOTCONN; } @@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; - ctxt->frmr = vec->frmr; - if (vec->frmr) - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - else - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_dma_lkey; @@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma, int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - if (!vec->frmr) { - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) - goto err; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; - } else { - ctxt->sge[sge_no].addr = (unsigned long) - vec->sge[sge_no].iov_base; - ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; - } + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } BUG_ON(byte_count != 0); @@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - if (vec->frmr) { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - vec->frmr->mr->lkey; - send_wr.next = &inv_wr; - } ret = svc_rdma_send(rdma, &send_wr); if (ret) @@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 1); return -EIO; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 25688fa2207f..e7323fbbd348 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -65,6 +66,7 @@ static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static int svc_rdma_secure_port(struct svc_rqst *); static void rq_cq_reap(struct svcxprt_rdma *xprt); static void sq_cq_reap(struct svcxprt_rdma *xprt); @@ -82,6 +84,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, }; struct svc_xprt_class svc_rdma_class = { @@ -160,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; - map->frmr = NULL; return map; } @@ -336,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 1); break; case IB_WR_RDMA_WRITE: + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: + svc_rdma_put_frmr(xprt, ctxt->frmr); if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; BUG_ON(!read_hdr); - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); spin_lock_bh(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); list_add_tail(&read_hdr->dto_q, @@ -363,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt, break; default: + BUG_ON(1); printk(KERN_ERR "svcrdma: unexpected completion type, " "opcode=%d\n", ctxt->wr_op); @@ -378,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt, static void sq_cq_reap(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - struct ib_wc wc; + struct ib_wc wc_a[6]; + struct ib_wc *wc; struct ib_cq *cq = xprt->sc_sq_cq; int ret; + memset(wc_a, 0, sizeof(wc_a)); + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - if (wc.status != IB_WC_SUCCESS) - /* Close the transport */ - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { + int i; - /* Decrement used SQ WR count */ - atomic_dec(&xprt->sc_sq_count); - wake_up(&xprt->sc_send_wait); + for (i = 0; i < ret; i++) { + wc = &wc_a[i]; + if (wc->status != IB_WC_SUCCESS) { + dprintk("svcrdma: sq wc err status %d\n", + wc->status); - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - if (ctxt) - process_context(xprt, ctxt); + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + } - svc_xprt_put(&xprt->sc_xprt); + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *) + (unsigned long)wc->wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } } if (ctxt) @@ -993,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) need_dma_mr = 0; break; case RDMA_TRANSPORT_IB: - if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else if (!(devattr.device_cap_flags & + IB_DEVICE_LOCAL_DMA_LKEY)) { need_dma_mr = 1; dma_mr_acc = IB_ACCESS_LOCAL_WRITE; } else @@ -1190,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) container_of(xprt, struct svcxprt_rdma, sc_xprt); /* - * If there are fewer SQ WR available than required to send a - * simple response, return false. - */ - if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) - return 0; - - /* - * ...or there are already waiters on the SQ, + * If there are already waiters on the SQ, * return false. */ if (waitqueue_active(&rdma->sc_send_wait)) @@ -1207,6 +1219,11 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +static int svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + return 1; +} + /* * Attempt to register the kvec representing the RPC memory with the * device. diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 25a3dcf15cae..402a7e9a16b7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -893,11 +893,11 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xprt_disconnect_done(xprt); } @@ -1497,12 +1497,12 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void xs_sock_mark_closed(struct rpc_xprt *xprt) @@ -1556,10 +1556,10 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: @@ -1578,9 +1578,9 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case TCP_CLOSE: xs_tcp_cancel_linger_timeout(xprt); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 3aaf73de9e2d..ad844d365340 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -47,7 +47,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) int hdr_space = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN); u16 cmd; - if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN))) + if ((req_userhdr->cmd & 0xC000) && (!netlink_capable(skb, CAP_NET_ADMIN))) cmd = TIPC_CMD_NOT_NET_ADMIN; else cmd = req_userhdr->cmd; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bb7e8ba821f4..749f80c21e22 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1207,7 +1207,7 @@ restart: sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); - smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5adfd94c5b85..85d232bed87d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1925,9 +1925,23 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; -static int __vsock_core_init(void) +int __vsock_core_init(const struct vsock_transport *t, struct module *owner) { - int err; + int err = mutex_lock_interruptible(&vsock_register_mutex); + + if (err) + return err; + + if (transport) { + err = -EBUSY; + goto err_busy; + } + + /* Transport must be the owner of the protocol so that it can't + * unload while there are open sockets. + */ + vsock_proto.owner = owner; + transport = t; vsock_init_tables(); @@ -1951,36 +1965,19 @@ static int __vsock_core_init(void) goto err_unregister_proto; } + mutex_unlock(&vsock_register_mutex); return 0; err_unregister_proto: proto_unregister(&vsock_proto); err_misc_deregister: misc_deregister(&vsock_device); - return err; -} - -int vsock_core_init(const struct vsock_transport *t) -{ - int retval = mutex_lock_interruptible(&vsock_register_mutex); - if (retval) - return retval; - - if (transport) { - retval = -EBUSY; - goto out; - } - - transport = t; - retval = __vsock_core_init(); - if (retval) - transport = NULL; - -out: + transport = NULL; +err_busy: mutex_unlock(&vsock_register_mutex); - return retval; + return err; } -EXPORT_SYMBOL_GPL(vsock_core_init); +EXPORT_SYMBOL_GPL(__vsock_core_init); void vsock_core_exit(void) { @@ -2000,5 +1997,5 @@ EXPORT_SYMBOL_GPL(vsock_core_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); -MODULE_VERSION("1.0.0.0-k"); +MODULE_VERSION("1.0.1.0-k"); MODULE_LICENSE("GPL v2"); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 7d09a712cb1f..88f108edfb58 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -284,14 +284,22 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy) } EXPORT_SYMBOL(cfg80211_sched_scan_results); -void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + ASSERT_RTNL(); + trace_cfg80211_sched_scan_stopped(wiphy); - rtnl_lock(); __cfg80211_stop_sched_scan(rdev, true); +} +EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); + +void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +{ + rtnl_lock(); + cfg80211_sched_scan_stopped_rtnl(wiphy); rtnl_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index acdcb4a81817..3546a77033de 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -234,7 +234,6 @@ void cfg80211_conn_work(struct work_struct *work) NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, false, NULL); - cfg80211_sme_free(wdev); } wdev_unlock(wdev); } @@ -648,6 +647,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wdev->wiphy, bss); } + cfg80211_sme_free(wdev); return; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f02f511b7107..c08fbd11ceff 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1842,7 +1842,7 @@ purge_queue: xfrm_pol_put(pol); } -static int xdst_queue_output(struct sk_buff *skb) +static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8f131c10a6f3..51398ae6cda8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2377,7 +2377,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (!netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || |