From 07a3061e0832fe22932e0fa977581e45b9c42431 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:35 +0200 Subject: batman-adv: netlink: add routing_algo query BATADV_CMD_GET_ROUTING_ALGOS is used to get the list of supported routing algorithms. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: Reduce the number of changes to BATADV_CMD_GET_ROUTING_ALGOS, fix includes] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 0fbf6fd4711b..7afce444a64e 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -73,6 +73,7 @@ enum batadv_nl_attrs { * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device * @BATADV_CMD_TP_METER: Start a tp meter session * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session + * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -81,6 +82,7 @@ enum batadv_nl_commands { BATADV_CMD_GET_MESH_INFO, BATADV_CMD_TP_METER, BATADV_CMD_TP_METER_CANCEL, + BATADV_CMD_GET_ROUTING_ALGOS, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 -- cgit v1.2.3 From b60620cf567b79da46096a0ba29b39f23b6e7f1c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:36 +0200 Subject: batman-adv: netlink: hardif query BATADV_CMD_GET_HARDIFS will return the list of hardifs (including index, name and MAC address) of all hardifs for a given softif. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: Reduce the number of changes to BATADV_CMD_GET_HARDIFS, add policy for attributes] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 4 ++ net/batman-adv/netlink.c | 128 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 7afce444a64e..8abcbca14b6a 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -40,6 +40,7 @@ * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment + * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -60,6 +61,7 @@ enum batadv_nl_attrs { BATADV_ATTR_TPMETER_BYTES, BATADV_ATTR_TPMETER_COOKIE, BATADV_ATTR_PAD, + BATADV_ATTR_ACTIVE, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, @@ -74,6 +76,7 @@ enum batadv_nl_attrs { * @BATADV_CMD_TP_METER: Start a tp meter session * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. + * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -83,6 +86,7 @@ enum batadv_nl_commands { BATADV_CMD_TP_METER, BATADV_CMD_TP_METER_CANCEL, BATADV_CMD_GET_ROUTING_ALGOS, + BATADV_CMD_GET_HARDIFS, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 19fb2657e274..3f872d6eec57 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -26,10 +26,14 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include #include #include "bat_algo.h" @@ -37,8 +41,6 @@ #include "soft-interface.h" #include "tp_meter.h" -struct sk_buff; - struct genl_family batadv_netlink_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -70,8 +72,24 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, }; +/** + * batadv_netlink_get_ifindex - Extract an interface index from a message + * @nlh: Message header + * @attrtype: Attribute which holds an interface index + * + * Return: interface index, or 0. + */ +static int +batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) +{ + struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); + + return attr ? nla_get_u32(attr) : 0; +} + /** * batadv_netlink_mesh_info_put - fill in generic information about mesh * interface @@ -381,6 +399,106 @@ out: return ret; } +/** + * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hard_iface: Hard interface to dump + * + * Return: error code, or 0 on success + */ +static int +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *hard_iface) +{ + struct net_device *net_dev = hard_iface->net_dev; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_HARDIFS); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + net_dev->ifindex) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + net_dev->name) || + nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, + net_dev->dev_addr)) + goto nla_put_failure; + + if (hard_iface->if_status == BATADV_IF_ACTIVE) { + if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_netlink_dump_hardifs - Dump all hard interface into a messages + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: error code, or length of reply message on success + */ +static int +batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hard_iface *hard_iface; + int ifindex; + int portid = NETLINK_CB(cb->skb).portid; + int seq = cb->nlh->nlmsg_seq; + int skip = cb->args[0]; + int i = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface) + return -ENODEV; + + if (!batadv_softif_is_valid(soft_iface)) { + dev_put(soft_iface); + return -ENODEV; + } + + rcu_read_lock(); + + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + if (i++ < skip) + continue; + + if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + hard_iface)) { + i--; + break; + } + } + + rcu_read_unlock(); + + dev_put(soft_iface); + + cb->args[0] = i; + + return msg->len; +} + static struct genl_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH_INFO, @@ -406,6 +524,12 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_algo_dump, }, + { + .cmd = BATADV_CMD_GET_HARDIFS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_netlink_dump_hardifs, + }, }; /** -- cgit v1.2.3 From d34f05507db245bef819b684ad84f9e0f9bb003d Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:37 +0200 Subject: batman-adv: netlink: add translation table query This adds the commands BATADV_CMD_GET_TRANSTABLE_LOCAL and BATADV_CMD_GET_TRANSTABLE_GLOBAL, which correspond to the transtable_local and transtable_global debugfs files. The batadv_tt_client_flags enum is moved to the UAPI to expose it as part of the netlink API. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: add policy for attributes, fix includes] Signed-off-by: Sven Eckelmann [sw@simonwunderlich.de: fix VID attributes content] Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 56 ++++++ net/batman-adv/netlink.c | 23 ++- net/batman-adv/netlink.h | 3 + net/batman-adv/packet.h | 36 ---- net/batman-adv/translation-table.c | 377 +++++++++++++++++++++++++++++++++++++ net/batman-adv/translation-table.h | 4 + 6 files changed, 462 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 8abcbca14b6a..1168d058cd2b 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -22,6 +22,42 @@ #define BATADV_NL_MCAST_GROUP_TPMETER "tpmeter" +/** + * enum batadv_tt_client_flags - TT client specific flags + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new + * update telling its new real location has not been received/sent yet + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. + * This information is used by the "AP Isolation" feature + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This + * information is used by the Extended Isolation feature + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has + * not been announced yet + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept + * in the table for one more originator interval for consistency purposes + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of + * the network but no nnode has already announced it + * + * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. + * Bits from 8 to 15 are called _local flags_ because they are used for local + * computations only. + * + * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with + * the other nodes in the network. To achieve this goal these flags are included + * in the TT CRC computation. + */ +enum batadv_tt_client_flags { + BATADV_TT_CLIENT_DEL = (1 << 0), + BATADV_TT_CLIENT_ROAM = (1 << 1), + BATADV_TT_CLIENT_WIFI = (1 << 4), + BATADV_TT_CLIENT_ISOLA = (1 << 5), + BATADV_TT_CLIENT_NOPURGE = (1 << 8), + BATADV_TT_CLIENT_NEW = (1 << 9), + BATADV_TT_CLIENT_PENDING = (1 << 10), + BATADV_TT_CLIENT_TEMP = (1 << 11), +}; + /** * enum batadv_nl_attrs - batman-adv netlink attributes * @@ -41,6 +77,14 @@ * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active + * @BATADV_ATTR_TT_ADDRESS: Client MAC address + * @BATADV_ATTR_TT_TTVN: Translation table version + * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version + * @BATADV_ATTR_TT_CRC32: CRC32 over translation table + * @BATADV_ATTR_TT_VID: VLAN ID + * @BATADV_ATTR_TT_FLAGS: Translation table client flags + * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best + * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -62,6 +106,14 @@ enum batadv_nl_attrs { BATADV_ATTR_TPMETER_COOKIE, BATADV_ATTR_PAD, BATADV_ATTR_ACTIVE, + BATADV_ATTR_TT_ADDRESS, + BATADV_ATTR_TT_TTVN, + BATADV_ATTR_TT_LAST_TTVN, + BATADV_ATTR_TT_CRC32, + BATADV_ATTR_TT_VID, + BATADV_ATTR_TT_FLAGS, + BATADV_ATTR_FLAG_BEST, + BATADV_ATTR_LAST_SEEN_MSECS, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, @@ -77,6 +129,8 @@ enum batadv_nl_attrs { * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces + * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations + * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -87,6 +141,8 @@ enum batadv_nl_commands { BATADV_CMD_TP_METER_CANCEL, BATADV_CMD_GET_ROUTING_ALGOS, BATADV_CMD_GET_HARDIFS, + BATADV_CMD_GET_TRANSTABLE_LOCAL, + BATADV_CMD_GET_TRANSTABLE_GLOBAL, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 3f872d6eec57..14360ec16513 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -40,6 +40,7 @@ #include "hard-interface.h" #include "soft-interface.h" #include "tp_meter.h" +#include "translation-table.h" struct genl_family batadv_netlink_family = { .id = GENL_ID_GENERATE, @@ -73,6 +74,14 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, }; /** @@ -82,7 +91,7 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { * * Return: interface index, or 0. */ -static int +int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); @@ -530,6 +539,18 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_netlink_dump_hardifs, }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_local_dump, + }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_global_dump, + }, }; /** diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index b399f49504df..52eb16281aba 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -23,8 +23,11 @@ #include #include +struct nlmsghdr; + void batadv_netlink_register(void); void batadv_netlink_unregister(void); +int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 6b011ff64dd8..6afc0b86950e 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -128,42 +128,6 @@ enum batadv_tt_data_flags { BATADV_TT_FULL_TABLE = BIT(4), }; -/** - * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it - * - * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. - * Bits from 8 to 15 are called _local flags_ because they are used for local - * computations only. - * - * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with - * the other nodes in the network. To achieve this goal these flags are included - * in the TT CRC computation. - */ -enum batadv_tt_client_flags { - BATADV_TT_CLIENT_DEL = BIT(0), - BATADV_TT_CLIENT_ROAM = BIT(1), - BATADV_TT_CLIENT_WIFI = BIT(4), - BATADV_TT_CLIENT_ISOLA = BIT(5), - BATADV_TT_CLIENT_NOPURGE = BIT(8), - BATADV_TT_CLIENT_NEW = BIT(9), - BATADV_TT_CLIENT_PENDING = BIT(10), - BATADV_TT_CLIENT_TEMP = BIT(11), -}; - /** * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index af2bfef6dca8..20804078293c 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -37,20 +37,27 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "soft-interface.h" @@ -1108,6 +1115,164 @@ out: return 0; } +/** + * batadv_tt_local_dump_entry - Dump one TT local entry into a message + * @msg :Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_tt_common_entry *common) +{ + void *hdr; + struct batadv_softif_vlan *vlan; + struct batadv_tt_local_entry *local; + unsigned int last_seen_msecs; + u32 crc; + + local = container_of(common, struct batadv_tt_local_entry, common); + last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); + + vlan = batadv_softif_vlan_get(bat_priv, common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_softif_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_LOCAL); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @head: Pointer to the list containing the local tt entries + * @idx_s: Number of entries to skip + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct hlist_head *head, int *idx_s) +{ + struct batadv_tt_common_entry *common; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(common, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + common)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + return 0; +} + +/** + * batadv_tt_local_dump - Dump TT local entries into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: Error code, or 0 on success + */ +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_hashtable *hash; + struct hlist_head *head; + int ret; + int ifindex; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hash = bat_priv->tt.local_hash; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, head, &idx)) + break; + + bucket++; + } + + ret = msg->len; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + cb->args[0] = bucket; + cb->args[1] = idx; + + return ret; +} + static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, @@ -1755,6 +1920,218 @@ out: return 0; } +/** + * batadv_tt_global_dump_subentry - Dump all TT local entries into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @common: tt local & tt global common data + * @orig: Originator node announcing a non-mesh client + * @best: Is the best originator for the TT entry + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_tt_common_entry *common, + struct batadv_tt_orig_list_entry *orig, + bool best) +{ + void *hdr; + struct batadv_orig_node_vlan *vlan; + u8 last_ttvn; + u32 crc; + + vlan = batadv_orig_node_vlan_get(orig->orig_node, + common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_orig_node_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_GLOBAL); + if (!hdr) + return -ENOBUFS; + + last_ttvn = atomic_read(&orig->orig_node->last_ttvn); + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig->orig_node->orig) || + nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || + nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_global_dump_entry - Dump one TT global entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * @sub_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_tt_common_entry *common, int *sub_s) +{ + struct batadv_tt_orig_list_entry *orig_entry, *best_entry; + struct batadv_tt_global_entry *global; + struct hlist_head *head; + int sub = 0; + bool best; + + global = container_of(common, struct batadv_tt_global_entry, common); + best_entry = batadv_transtable_best_orig(bat_priv, global); + head = &global->orig_list; + + hlist_for_each_entry_rcu(orig_entry, head, list) { + if (sub++ < *sub_s) + continue; + + best = (orig_entry == best_entry); + + if (batadv_tt_global_dump_subentry(msg, portid, seq, common, + orig_entry, best)) { + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + *sub_s = 0; + return 0; +} + +/** + * batadv_tt_global_dump_bucket - Dump one TT local bucket into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @head: Pointer to the list containing the global tt entries + * @idx_s: Number of entries to skip + * @sub: Number of entries to skip + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_tt_common_entry *common; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(common, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv, + common, sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_tt_global_dump - Dump TT global entries into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: Error code, or length of message on success + */ +int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_hashtable *hash; + struct hlist_head *head; + int ret; + int ifindex; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hash = bat_priv->tt.global_hash; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_tt_global_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, bat_priv, + head, &idx, &sub)) + break; + + bucket++; + } + + ret = msg->len; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; + + return ret; +} + /** * _batadv_tt_global_del_orig_entry - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 02b0f85527cc..783fdba84db2 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -22,8 +22,10 @@ #include +struct netlink_callback; struct net_device; struct seq_file; +struct sk_buff; int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, @@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const char *message, bool roaming); int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); +int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); -- cgit v1.2.3 From 85cf8c859d53f6f53c37bb7f23a41f6171427021 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:39 +0200 Subject: batman-adv: netlink: add originator and neighbor table queries Add BATADV_CMD_GET_ORIGINATORS and BATADV_CMD_GET_NEIGHBORS commands, using handlers bat_orig_dump and bat_neigh_dump in batadv_algo_ops. Will always return -EOPNOTSUPP for now, as no implementations exist yet. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven@narfation.org: Rewrite based on new algo_ops structures] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 4 + net/batman-adv/netlink.c | 13 ++++ net/batman-adv/originator.c | 160 ++++++++++++++++++++++++++++++++++++++++ net/batman-adv/originator.h | 4 + net/batman-adv/types.h | 9 +++ 5 files changed, 190 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 1168d058cd2b..3f7a415f5e09 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -131,6 +131,8 @@ enum batadv_nl_attrs { * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations + * @BATADV_CMD_GET_ORIGINATORS: Query list of originators + * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -143,6 +145,8 @@ enum batadv_nl_commands { BATADV_CMD_GET_HARDIFS, BATADV_CMD_GET_TRANSTABLE_LOCAL, BATADV_CMD_GET_TRANSTABLE_GLOBAL, + BATADV_CMD_GET_ORIGINATORS, + BATADV_CMD_GET_NEIGHBORS, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0c0d20bb92ce..8469fc4ec5a3 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -39,6 +39,7 @@ #include "bat_algo.h" #include "hard-interface.h" +#include "originator.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" @@ -554,6 +555,18 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_tt_global_dump, }, + { + .cmd = BATADV_CMD_GET_ORIGINATORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_orig_dump, + }, + { + .cmd = BATADV_CMD_GET_NEIGHBORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_hardif_neigh_dump, + }, }; /** diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 3940b5d24421..95c85558c530 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -28,11 +28,15 @@ #include #include #include +#include #include #include +#include #include #include #include +#include +#include #include "bat_algo.h" #include "distributed-arp-table.h" @@ -42,8 +46,10 @@ #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "network-coding.h" #include "routing.h" +#include "soft-interface.h" #include "translation-table.h" /* hash class keys */ @@ -720,6 +726,83 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) return 0; } +/** + * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->neigh.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + /** * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for * free after rcu grace period @@ -1330,6 +1413,83 @@ out: return 0; } +/** + * batadv_orig_dump - Dump to netlink the originator infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->orig.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) { diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 566306bf05dc..ebc56183f358 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -31,7 +31,9 @@ #include "hash.h" +struct netlink_callback; struct seq_file; +struct sk_buff; bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); @@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); struct batadv_orig_ifinfo * @@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 72806a3c40df..968023a61598 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -28,6 +28,7 @@ #include #include #include +#include #include /* for linux/wait.h */ #include #include @@ -1418,6 +1419,7 @@ struct batadv_algo_iface_ops { * @is_similar_or_better: check if neigh1 is equally similar or better than * neigh2 for their respective outgoing interface from the metric prospective * @print: print the single hop neighbor list (optional) + * @dump: dump neighbors to a netlink socket (optional) */ struct batadv_algo_neigh_ops { void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); @@ -1430,6 +1432,9 @@ struct batadv_algo_neigh_ops { struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); void (*print)(struct batadv_priv *priv, struct seq_file *seq); + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); }; /** @@ -1441,6 +1446,7 @@ struct batadv_algo_neigh_ops { * @del_if: ask the routing algorithm to apply the needed changes to the * orig_node due to an hard-interface being removed from the mesh (optional) * @print: print the originator table (optional) + * @dump: dump originators to a netlink socket (optional) */ struct batadv_algo_orig_ops { void (*free)(struct batadv_orig_node *orig_node); @@ -1449,6 +1455,9 @@ struct batadv_algo_orig_ops { int del_if_num); void (*print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); }; /** -- cgit v1.2.3 From 024f99cb4acc14dab5e55e1ecdf6aad31269ca98 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:40 +0200 Subject: batman-adv: add B.A.T.M.A.N. IV bat_{orig, neigh}_dump implementations Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: Fix function parameter alignments, add policy for attributes, fix includes, fix algo_ops integration] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 4 + net/batman-adv/bat_iv_ogm.c | 366 ++++++++++++++++++++++++++++++++++++++++ net/batman-adv/netlink.c | 2 + 3 files changed, 372 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 3f7a415f5e09..ba2359a1f464 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -85,6 +85,8 @@ enum batadv_tt_client_flags { * @BATADV_ATTR_TT_FLAGS: Translation table client flags * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen + * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address + * @BATADV_ATTR_TQ: TQ to neighbour * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -114,6 +116,8 @@ enum batadv_nl_attrs { BATADV_ATTR_TT_FLAGS, BATADV_ATTR_FLAG_BEST, BATADV_ATTR_LAST_SEEN_MSECS, + BATADV_ATTR_NEIGH_ADDRESS, + BATADV_ATTR_TQ, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a40cdf273625..7a8c0f63e2ae 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ #include #include #include +#include +#include +#include #include "bat_algo.h" #include "bitarray.h" @@ -55,6 +59,7 @@ #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "network-coding.h" #include "originator.h" #include "packet.h" @@ -1947,6 +1952,235 @@ next: seq_puts(seq, "No batman nodes in range ...\n"); } +/** + * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a + * given outgoing interface. + * @neigh_node: Neighbour of interest + * @if_outgoing: Outgoing interface of interest + * @tq_avg: Pointer of where to store the TQ average + * + * Return: False if no average TQ available, otherwise true. + */ +static bool +batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_outgoing, + u8 *tq_avg) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return false; + + *tq_avg = n_ifinfo->bat_iv.tq_avg; + batadv_neigh_ifinfo_put(n_ifinfo); + + return true; +} + +/** + * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + void *hdr; + u8 tq_avg; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) + return 0; + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig) || + nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + u8 tq_avg_best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, + &tq_avg_best)) + goto out; + + if (tq_avg_best == 0) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, + bat_priv, if_outgoing, + orig_node, neigh_node, + best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} + +/** + * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_iv_ogm_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_iv_ogm_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, + &idx, &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} + /** * batadv_iv_hardif_neigh_print - print a single hop neighbour node * @seq: neighbour table seq_file struct @@ -2043,6 +2277,136 @@ out: return ret; } +/** + * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to be dumped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface + * into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: Hard interface to dump the neighbours for + * @idx_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, + hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dump to this hard interfaace + */ +static void +batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, + list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + hard_iface, &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + /** * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison @@ -2330,9 +2694,11 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .print = batadv_iv_neigh_print, + .dump = batadv_iv_ogm_neigh_dump, }, .orig = { .print = batadv_iv_ogm_orig_print, + .dump = batadv_iv_ogm_orig_dump, .free = batadv_iv_ogm_orig_free, .add_if = batadv_iv_ogm_orig_add_if, .del_if = batadv_iv_ogm_orig_del_if, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 8469fc4ec5a3..0c7940cc1968 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -84,6 +84,8 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, }; /** -- cgit v1.2.3 From f02a478f518ee5690f279c8c2d3a6222143a7b20 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 3 Jul 2016 13:31:41 +0200 Subject: batman-adv: add B.A.T.M.A.N. V bat_{orig, neigh}_dump implementations Dump the algo V originators and neighbours. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven@narfation.org: Fix includes, fix algo_ops integration] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 2 + net/batman-adv/bat_v.c | 340 ++++++++++++++++++++++++++++++++++++++++ net/batman-adv/netlink.c | 1 + 3 files changed, 343 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index ba2359a1f464..2e2747fb1311 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -87,6 +87,7 @@ enum batadv_tt_client_flags { * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address * @BATADV_ATTR_TQ: TQ to neighbour + * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -118,6 +119,7 @@ enum batadv_nl_attrs { BATADV_ATTR_LAST_SEEN_MSECS, BATADV_ATTR_NEIGH_ADDRESS, BATADV_ATTR_TQ, + BATADV_ATTR_THROUGHPUT, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 1d777b171366..9dccfaf32115 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -22,17 +22,22 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include +#include +#include #include "bat_algo.h" #include "bat_v_elp.h" @@ -42,9 +47,12 @@ #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +struct sk_buff; + static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); @@ -205,6 +213,138 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, seq_puts(seq, "No batman nodes in range ...\n"); } +/** + * batadv_v_neigh_dump_neigh - Dump a neighbour into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to dump + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + u32 throughput; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + throughput = throughput * 100; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into + * a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: The hard interface to be dumped + * @idx_s: Entries to be skipped + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a + * message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dumping to this hard interface + */ +static void +batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, hard_iface, + &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + /** * batadv_v_orig_print - print the originator table * @bat_priv: the bat priv with all the soft interface information @@ -272,6 +412,204 @@ next: seq_puts(seq, "No batman nodes in range ...\n"); } +/** + * batadv_v_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + unsigned int last_seen_msecs; + u32 throughput; + void *hdr; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return 0; + + throughput = n_ifinfo->bat_v.throughput * 100; + + batadv_neigh_ifinfo_put(n_ifinfo); + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || + nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_v_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + neigh_node, best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} + +/** + * batadv_v_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_v_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_v_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, &idx, + &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} + static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, @@ -573,9 +911,11 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, .print = batadv_v_neigh_print, + .dump = batadv_v_neigh_dump, }, .orig = { .print = batadv_v_orig_print, + .dump = batadv_v_orig_dump, }, .gw = { .store_sel_class = batadv_v_store_sel_class, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0c7940cc1968..025f2ec2b27e 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -86,6 +86,7 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, }; /** -- cgit v1.2.3 From d7129dafcb71adfd1a166d0477ce0f564cf410d5 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Jul 2016 13:31:42 +0200 Subject: batman-adv: netlink: add gateway table queries Add BATADV_CMD_GET_GATEWAYS commands, using handlers bat_gw_dump in batadv_algo_ops. Will always return -EOPNOTSUPP for now, as no implementations exist yet. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 8 ++++++ net/batman-adv/gateway_client.c | 59 +++++++++++++++++++++++++++++++++++++++++ net/batman-adv/gateway_client.h | 2 ++ net/batman-adv/netlink.c | 10 +++++++ net/batman-adv/types.h | 3 +++ 5 files changed, 82 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 2e2747fb1311..a13fc09e8192 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -88,6 +88,9 @@ enum batadv_tt_client_flags { * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address * @BATADV_ATTR_TQ: TQ to neighbour * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour + * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth + * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth + * @BATADV_ATTR_ROUTER: Gateway router MAC address * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -120,6 +123,9 @@ enum batadv_nl_attrs { BATADV_ATTR_NEIGH_ADDRESS, BATADV_ATTR_TQ, BATADV_ATTR_THROUGHPUT, + BATADV_ATTR_BANDWIDTH_UP, + BATADV_ATTR_BANDWIDTH_DOWN, + BATADV_ATTR_ROUTER, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, @@ -139,6 +145,7 @@ enum batadv_nl_attrs { * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations * @BATADV_CMD_GET_ORIGINATORS: Query list of originators * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours + * @BATADV_CMD_GET_GATEWAYS: Query list of gateways * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -153,6 +160,7 @@ enum batadv_nl_commands { BATADV_CMD_GET_TRANSTABLE_GLOBAL, BATADV_CMD_GET_ORIGINATORS, BATADV_CMD_GET_NEIGHBORS, + BATADV_CMD_GET_GATEWAYS, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index a77a17939f1e..c2928c2287b8 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -39,13 +41,17 @@ #include #include #include +#include +#include #include "gateway_common.h" #include "hard-interface.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "routing.h" +#include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" @@ -500,6 +506,59 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) return 0; } +/** + * batadv_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * + * Return: Error code, or length of message + */ +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + if (!bat_priv->algo_ops->gw.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + /** * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message * @skb: the packet to check diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 6b40432aa1ed..859166d03561 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -23,6 +23,7 @@ #include struct batadv_tvlv_gateway_data; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -43,6 +44,7 @@ void batadv_gw_node_put(struct batadv_gw_node *gw_node); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 025f2ec2b27e..c68ccb03634d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -38,6 +38,7 @@ #include #include "bat_algo.h" +#include "gateway_client.h" #include "hard-interface.h" #include "originator.h" #include "soft-interface.h" @@ -87,6 +88,9 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TQ] = { .type = NLA_U8 }, [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, }; /** @@ -570,6 +574,12 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_hardif_neigh_dump, }, + { + .cmd = BATADV_CMD_GET_GATEWAYS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_gw_dump, + }, }; /** diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 968023a61598..b5f01a36ec34 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1469,6 +1469,7 @@ struct batadv_algo_orig_ops { * @is_eligible: check if a newly discovered GW is a potential candidate for * the election as best GW (optional) * @print: print the gateway table (optional) + * @dump: dump gateways to a netlink socket (optional) */ struct batadv_algo_gw_ops { ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, @@ -1480,6 +1481,8 @@ struct batadv_algo_gw_ops { struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node); void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq); + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv); }; /** -- cgit v1.2.3 From 04f3f5bf1883fbe0acba5c1fc698cf5cedebc5c5 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 3 Jul 2016 13:31:45 +0200 Subject: batman-adv: add B.A.T.M.A.N. Dump BLA claims via netlink Dump the list of bridge loop avoidance claims via the netlink socket. Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: add policy for attributes, fix includes, fix soft_iface reference leak] Signed-off-by: Sven Eckelmann [sw@simonwunderlich.de: fix kerneldoc, fix error reporting] Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 12 +++ net/batman-adv/bridge_loop_avoidance.c | 169 +++++++++++++++++++++++++++++++++ net/batman-adv/bridge_loop_avoidance.h | 10 +- net/batman-adv/netlink.c | 12 +++ 4 files changed, 202 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a13fc09e8192..96b37ab2e840 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -91,6 +91,11 @@ enum batadv_tt_client_flags { * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth * @BATADV_ATTR_ROUTER: Gateway router MAC address + * @BATADV_ATTR_BLA_OWN: Flag indicating own originator + * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address + * @BATADV_ATTR_BLA_VID: BLA VLAN ID + * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address + * @BATADV_ATTR_BLA_CRC: BLA CRC * @__BATADV_ATTR_AFTER_LAST: internal use * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available * @BATADV_ATTR_MAX: highest attribute number currently defined @@ -126,6 +131,11 @@ enum batadv_nl_attrs { BATADV_ATTR_BANDWIDTH_UP, BATADV_ATTR_BANDWIDTH_DOWN, BATADV_ATTR_ROUTER, + BATADV_ATTR_BLA_OWN, + BATADV_ATTR_BLA_ADDRESS, + BATADV_ATTR_BLA_VID, + BATADV_ATTR_BLA_BACKBONE, + BATADV_ATTR_BLA_CRC, /* add attributes above here, update the policy in netlink.c */ __BATADV_ATTR_AFTER_LAST, NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, @@ -146,6 +156,7 @@ enum batadv_nl_attrs { * @BATADV_CMD_GET_ORIGINATORS: Query list of originators * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours * @BATADV_CMD_GET_GATEWAYS: Query list of gateways + * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -161,6 +172,7 @@ enum batadv_nl_commands { BATADV_CMD_GET_ORIGINATORS, BATADV_CMD_GET_NEIGHBORS, BATADV_CMD_GET_GATEWAYS, + BATADV_CMD_GET_BLA_CLAIM, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index c75ef648f0fd..aafa88f3e98d 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,18 @@ #include #include #include +#include +#include +#include +#include #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +#include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" @@ -2051,6 +2058,168 @@ out: return 0; } +/** + * batadv_bla_claim_dump_entry - dump one entry of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @claim: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_claim *claim) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(claim->backbone_gw->orig, + primary_addr); + + spin_lock_bh(&claim->backbone_gw->crc_lock); + backbone_crc = claim->backbone_gw->crc; + spin_unlock_bh(&claim->backbone_gw->crc_lock); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || + nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + claim->backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_claim_dump_bucket - dump one bucket of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. + */ +static int +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_claim *claim; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(claim, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_claim_dump_entry(msg, portid, seq, + primary_if, claim)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} + +/** + * batadv_bla_claim_dump - dump claim table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.claim_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_claim_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + /** * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq * file diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 0f01daeb359e..a80b9e96f28e 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -23,6 +23,7 @@ #include struct net_device; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -35,6 +36,7 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, @@ -47,7 +49,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); - +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ @@ -112,6 +114,12 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv) { } +static inline int batadv_bla_claim_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index c68ccb03634d..b33675cbaecf 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -38,6 +38,7 @@ #include #include "bat_algo.h" +#include "bridge_loop_avoidance.h" #include "gateway_client.h" #include "hard-interface.h" #include "originator.h" @@ -91,6 +92,11 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, }; /** @@ -580,6 +586,12 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_gw_dump, }, + { + .cmd = BATADV_CMD_GET_BLA_CLAIM, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_claim_dump, + }, }; /** -- cgit v1.2.3 From ea4152e1171604f325f1a5f080190823a0edbc1f Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Sun, 3 Jul 2016 13:31:47 +0200 Subject: batman-adv: add backbone table netlink support Dump the list of bridge loop avoidance backbones via the netlink socket. Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- include/uapi/linux/batman_adv.h | 2 + net/batman-adv/bridge_loop_avoidance.c | 164 +++++++++++++++++++++++++++++++++ net/batman-adv/bridge_loop_avoidance.h | 7 ++ net/batman-adv/netlink.c | 7 ++ 4 files changed, 180 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 96b37ab2e840..734fe83ab645 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -157,6 +157,7 @@ enum batadv_nl_attrs { * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours * @BATADV_CMD_GET_GATEWAYS: Query list of gateways * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims + * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones * @__BATADV_CMD_AFTER_LAST: internal use * @BATADV_CMD_MAX: highest used command number */ @@ -173,6 +174,7 @@ enum batadv_nl_commands { BATADV_CMD_GET_NEIGHBORS, BATADV_CMD_GET_GATEWAYS, BATADV_CMD_GET_BLA_CLAIM, + BATADV_CMD_GET_BLA_BACKBONE, /* add new commands above here */ __BATADV_CMD_AFTER_LAST, BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index aafa88f3e98d..35ed1d32bab5 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2283,3 +2283,167 @@ out: batadv_hardif_put(primary_if); return 0; } + +/** + * batadv_bla_backbone_dump_entry - dump one entry of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @backbone_gw: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_backbone_gw *backbone_gw) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + int msecs; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); + + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + + msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. + */ +static int +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_backbone_gw *backbone_gw; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_backbone_dump_entry(msg, portid, seq, + primary_if, backbone_gw)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} + +/** + * batadv_bla_backbone_dump - dump backbone table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.backbone_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_backbone_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index a80b9e96f28e..1ae93e46fb98 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -39,6 +39,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, @@ -120,6 +121,12 @@ static inline int batadv_bla_claim_dump(struct sk_buff *msg, return -EOPNOTSUPP; } +static inline int batadv_bla_backbone_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 464de9d05135..c3f68167591d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -600,6 +600,13 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_bla_claim_dump, }, + { + .cmd = BATADV_CMD_GET_BLA_BACKBONE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_backbone_dump, + }, + }; /** -- cgit v1.2.3 From 631fee7d70e8eabb642b4bcc58f08bbe880c91aa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 9 Aug 2016 06:51:06 -0700 Subject: net: Remove fib_local variable After commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") fib_local is set but not used. Remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - net/ipv4/fib_frontend.c | 7 ------- 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d061ffeb1e71..7adf4386ac8f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -40,7 +40,6 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - struct fib_table __rcu *fib_local; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..317c31939732 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -93,9 +93,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id) return NULL; switch (id) { - case RT_TABLE_LOCAL: - rcu_assign_pointer(net->ipv4.fib_local, tb); - break; case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; @@ -137,9 +134,6 @@ static void fib_replace_table(struct net *net, struct fib_table *old, { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { - case RT_TABLE_LOCAL: - rcu_assign_pointer(net->ipv4.fib_local, new); - break; case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; @@ -1249,7 +1243,6 @@ static void ip_fib_net_exit(struct net *net) rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES - RCU_INIT_POINTER(net->ipv4.fib_local, NULL); RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif -- cgit v1.2.3 From e45a8a9e60ff1dd5ad118c794337a1101b46ab0d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 9 Aug 2016 18:27:08 +0200 Subject: xfrm: constify xfrm_replay structures The xfrm_replay structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_replay.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..d2fdd6d70959 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -187,7 +187,7 @@ struct xfrm_state { struct xfrm_replay_state_esn *preplay_esn; /* The functions for replay detection. */ - struct xfrm_replay *repl; + const struct xfrm_replay *repl; /* internal flag that only holds state for delayed aevent at the * moment diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 4fd725a0c500..cdc2e2e71bff 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) x->repl->notify(x, XFRM_REPLAY_UPDATE); } -static struct xfrm_replay xfrm_replay_legacy = { +static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, @@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = { .overflow = xfrm_replay_overflow, }; -static struct xfrm_replay xfrm_replay_bmp = { +static const struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, @@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = { .overflow = xfrm_replay_overflow_bmp, }; -static struct xfrm_replay xfrm_replay_esn = { +static const struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, -- cgit v1.2.3 From d737a5805581c6f99dad4caa9fdf80965d617d1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:09 +0200 Subject: xfrm: state: don't use lock anymore unless acquire operation is needed push the lock down, after earlier patches we can rely on rcu to make sure state struct won't go away. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 6 +++--- net/xfrm/xfrm_state.c | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 24cd3949a9a4..1ab51d188408 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -38,9 +38,9 @@ struct netns_xfrm { * mode. Also, it can be used by ah/esp icmp error handler to find * offending SA. */ - struct hlist_head *state_bydst; - struct hlist_head *state_bysrc; - struct hlist_head *state_byspi; + struct hlist_head __rcu *state_bydst; + struct hlist_head __rcu *state_bysrc; + struct hlist_head __rcu *state_byspi; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 53e7867f9254..1a15b658a79e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -799,7 +799,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&xfrm_state_hash_generation); - spin_lock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { if (x->props.family == encap_family && @@ -870,6 +870,7 @@ found: } if (km_query(x, tmpl, pol) == 0) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); @@ -883,6 +884,7 @@ found: tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -899,7 +901,7 @@ out: } else { *err = acquire_in_progress ? -EAGAIN : error; } - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From 59cc1f61f09c26ce82c308e24b76141e1efe99f8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 10 Aug 2016 11:05:15 +0200 Subject: net: sched: convert qdisc linked list to hashtable Convert the per-device linked list into a hashtable. The primary motivation for this change is that currently, we're not tracking all the qdiscs in hierarchy (e.g. excluding default qdiscs), as the lookup performed over the linked list by qdisc_match_from_root() is rather expensive. The ultimate goal is to get rid of hidden qdiscs completely, which will bring much more determinism in user experience. Reviewed-by: Cong Wang Signed-off-by: Jiri Kosina Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ include/net/pkt_sched.h | 4 ++-- include/net/sch_generic.h | 2 +- net/core/dev.c | 3 +++ net/sched/sch_api.c | 23 +++++++++++++---------- net/sched/sch_generic.c | 8 +++++--- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 2 +- 8 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 076df5360ba5..96e0b6cd964e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -1800,6 +1801,9 @@ struct net_device { unsigned int num_tx_queues; unsigned int real_num_tx_queues; struct Qdisc *qdisc; +#ifdef CONFIG_NET_SCHED + DECLARE_HASHTABLE (qdisc_hash, 4); +#endif unsigned long tx_queue_len; spinlock_t tx_global_lock; int watchdog_timeo; diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 7caa99b482c6..cd334c9584e9 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -90,8 +90,8 @@ int unregister_qdisc(struct Qdisc_ops *qops); void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); -void qdisc_list_add(struct Qdisc *q); -void qdisc_list_del(struct Qdisc *q); +void qdisc_hash_add(struct Qdisc *q); +void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 909aff2db2b3..0d501779cc68 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -61,7 +61,7 @@ struct Qdisc { u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; - struct list_head list; + struct hlist_node hash; u32 handle; u32 parent; void *u32_node; diff --git a/net/core/dev.c b/net/core/dev.c index 4ce07dc25573..936ea0054f57 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7629,6 +7629,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 12ebde845523..25aada7b095c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -263,33 +264,33 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry_rcu(q, &root->list, list) { + hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) { if (q->handle == handle) return q; } return NULL; } -void qdisc_list_add(struct Qdisc *q) +void qdisc_hash_add(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); - list_add_tail_rcu(&q->list, &root->list); + hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); } } -EXPORT_SYMBOL(qdisc_list_add); +EXPORT_SYMBOL(qdisc_hash_add); -void qdisc_list_del(struct Qdisc *q) +void qdisc_hash_del(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); - list_del_rcu(&q->list); + hash_del_rcu(&q->hash); } } -EXPORT_SYMBOL(qdisc_list_del); +EXPORT_SYMBOL(qdisc_hash_del); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { @@ -998,7 +999,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out4; } - qdisc_list_add(sch); + qdisc_hash_add(sch); return sch; } @@ -1435,6 +1436,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; + int b; if (!root) return 0; @@ -1449,7 +1451,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, goto done; q_idx++; } - list_for_each_entry(q, &root->list, list) { + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (q_idx < s_q_idx) { q_idx++; continue; @@ -1765,6 +1767,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, int *t_p, int s_t) { struct Qdisc *q; + int b; if (!root) return 0; @@ -1772,7 +1775,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; - list_for_each_entry(q, &root->list, list) { + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e95b67cd5718..18faecc3f13e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -423,7 +423,6 @@ struct Qdisc noop_qdisc = { .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, - .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .running = SEQCNT_ZERO(noop_qdisc.running), @@ -613,7 +612,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); spin_lock_init(&sch->busylock); @@ -700,7 +698,7 @@ void qdisc_destroy(struct Qdisc *qdisc) return; #ifdef CONFIG_NET_SCHED - qdisc_list_del(qdisc); + qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif @@ -788,6 +786,10 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } +#ifdef CONFIG_NET_SCHED + if (dev->qdisc) + qdisc_hash_add(dev->qdisc); +#endif } static void transition_one_qdisc(struct net_device *dev, diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index b9439827c172..2bc8d7f8df16 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); #endif } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 549c66359924..b5c502c78143 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch) if (old) qdisc_destroy(old); if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); } kfree(priv->qdiscs); priv->qdiscs = NULL; -- cgit v1.2.3 From ab10dccb11608b96b43b557c12a5ad867723e503 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 9 Aug 2016 12:38:24 +0800 Subject: rps: Inspect PPTP encapsulated by GRE to get flow hash The PPTP is encapsulated by GRE header with that GRE_VERSION bits must contain one. But current GRE RPS needs the GRE_VERSION must be zero. So RPS does not work for PPTP traffic. In my test environment, there are four MIPS cores, and all traffic are passed through by PPTP. As a result, only one core is 100% busy while other three cores are very idle. After this patch, the usage of four cores are balanced well. Signed-off-by: Gao Feng Reviewed-by: Philip Prindeville Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 36 +------------ include/net/gre.h | 10 +++- include/net/pptp.h | 40 +++++++++++++++ include/uapi/linux/if_tunnel.h | 7 ++- net/core/flow_dissector.c | 113 ++++++++++++++++++++++++++++------------- 5 files changed, 135 insertions(+), 71 deletions(-) create mode 100644 include/net/pptp.h (limited to 'include') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index ae0905ed4a32..3e68dbc0af7f 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -53,41 +54,6 @@ static struct proto pptp_sk_proto __read_mostly; static const struct ppp_channel_ops pptp_chan_ops; static const struct proto_ops pptp_ops; -#define PPP_LCP_ECHOREQ 0x09 -#define PPP_LCP_ECHOREP 0x0A -#define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) - -#define MISSING_WINDOW 20 -#define WRAPPED(curseq, lastseq)\ - ((((curseq) & 0xffffff00) == 0) &&\ - (((lastseq) & 0xffffff00) == 0xffffff00)) - -#define PPTP_GRE_PROTO 0x880B -#define PPTP_GRE_VER 0x1 - -#define PPTP_GRE_FLAG_C 0x80 -#define PPTP_GRE_FLAG_R 0x40 -#define PPTP_GRE_FLAG_K 0x20 -#define PPTP_GRE_FLAG_S 0x10 -#define PPTP_GRE_FLAG_A 0x80 - -#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C) -#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R) -#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K) -#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S) -#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A) - -#define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header)) -struct pptp_gre_header { - u8 flags; - u8 ver; - __be16 protocol; - __be16 payload_len; - __be16 call_id; - __be32 seq; - __be32 ack; -} __packed; - static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) { struct pppox_sock *sock; diff --git a/include/net/gre.h b/include/net/gre.h index 7a54a31d1d4c..8962e1e68449 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -7,7 +7,15 @@ struct gre_base_hdr { __be16 flags; __be16 protocol; -}; +} __packed; + +struct gre_full_hdr { + struct gre_base_hdr fixed_header; + __be16 csum; + __be16 reserved1; + __be32 key; + __be32 seq; +} __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 diff --git a/include/net/pptp.h b/include/net/pptp.h new file mode 100644 index 000000000000..301d3e2ba1f9 --- /dev/null +++ b/include/net/pptp.h @@ -0,0 +1,40 @@ +#ifndef _NET_PPTP_H +#define _NET_PPTP_H + +#define PPP_LCP_ECHOREQ 0x09 +#define PPP_LCP_ECHOREP 0x0A +#define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) + +#define MISSING_WINDOW 20 +#define WRAPPED(curseq, lastseq)\ + ((((curseq) & 0xffffff00) == 0) &&\ + (((lastseq) & 0xffffff00) == 0xffffff00)) + +#define PPTP_GRE_PROTO 0x880B +#define PPTP_GRE_VER 0x1 + +#define PPTP_GRE_FLAG_C 0x80 +#define PPTP_GRE_FLAG_R 0x40 +#define PPTP_GRE_FLAG_K 0x20 +#define PPTP_GRE_FLAG_S 0x10 +#define PPTP_GRE_FLAG_A 0x80 + +#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C) +#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R) +#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K) +#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S) +#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A) + +#define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header)) +struct pptp_gre_header { + u8 flags; + u8 ver; + __be16 protocol; + __be16 payload_len; + __be16 call_id; + __be32 seq; + __be32 ack; +} __packed; + + +#endif diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 1046f5515174..60dbb200de60 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -24,9 +24,14 @@ #define GRE_SEQ __cpu_to_be16(0x1000) #define GRE_STRICT __cpu_to_be16(0x0800) #define GRE_REC __cpu_to_be16(0x0700) -#define GRE_FLAGS __cpu_to_be16(0x00F8) +#define GRE_ACK __cpu_to_be16(0x0080) +#define GRE_FLAGS __cpu_to_be16(0x0078) #define GRE_VERSION __cpu_to_be16(0x0007) +#define GRE_VERSION_1 __cpu_to_be16(0x0001) +#define GRE_PROTO_PPP __cpu_to_be16(0x880b) +#define GRE_PPTP_KEY_MASK __cpu_to_be32(0xffff) + struct ip_tunnel_parm { char name[IFNAMSIZ]; int link; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61c5e..91028ae2fb01 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -338,32 +340,42 @@ mpls: ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - } *hdr, _hdr; + struct gre_base_hdr *hdr, _hdr; + u16 gre_ver; + int offset = 0; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) goto out_bad; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) break; - proto = hdr->proto; - nhoff += 4; + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + break; + + proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + break; + } + + offset += sizeof(struct gre_base_hdr); + if (hdr->flags & GRE_CSUM) - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->csum) + + sizeof(((struct gre_full_hdr *)0)->reserved1); + if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; - keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), data, hlen, &_keyid); - if (!keyid) goto out_bad; @@ -372,32 +384,65 @@ ip_proto_again: key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); - key_keyid->keyid = *keyid; + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->key); } + if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) + offset += sizeof(((struct pptp_gre_header *)0)->seq); + + if (gre_ver == 0) { + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff + offset, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + goto out_bad; + proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = (nhoff + offset); + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *)0)->ack); + + ppp_hdr = skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), _ppp_hdr); + if (!ppp_hdr) goto out_bad; - proto = eth->h_proto; - nhoff += sizeof(*eth); - - /* Cap headers that we access via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. - */ - if (NET_IP_ALIGN) - hlen = nhoff; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; } + nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; -- cgit v1.2.3 From 054c67d1c82afde13e475cdd8b7117a5e40bebb1 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 9 Aug 2016 03:51:23 -0400 Subject: qed*: Add support for ethtool link_ksettings callbacks. This patch adds the driver implementation for ethtool link_ksettings callbacks. qed driver now defines/uses the qed specific masks for representing link capability values. qede driver maps these values to to new link modes defined by the kernel implementation of link_ksettings. Please consider applying this to 'net-next' branch. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 107 ++++++++++--------- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 3 + drivers/net/ethernet/qlogic/qed/qed_mcp.h | 7 +- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 132 ++++++++++++++++++------ include/linux/qed/qed_if.h | 15 +++ 5 files changed, 180 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index c7dc34bfdd0a..d6e1dc5fac94 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1025,20 +1025,23 @@ static int qed_set_link(struct qed_dev *cdev, struct qed_link_params *params) link_params->speed.autoneg = params->autoneg; if (params->override_flags & QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS) { link_params->speed.advertised_speeds = 0; - if ((params->adv_speeds & SUPPORTED_1000baseT_Half) || - (params->adv_speeds & SUPPORTED_1000baseT_Full)) + if ((params->adv_speeds & QED_LM_1000baseT_Half_BIT) || + (params->adv_speeds & QED_LM_1000baseT_Full_BIT)) link_params->speed.advertised_speeds |= - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G; - if (params->adv_speeds & SUPPORTED_10000baseKR_Full) + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G; + if (params->adv_speeds & QED_LM_10000baseKR_Full_BIT) link_params->speed.advertised_speeds |= - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G; - if (params->adv_speeds & SUPPORTED_40000baseLR4_Full) + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G; + if (params->adv_speeds & QED_LM_25000baseKR_Full_BIT) link_params->speed.advertised_speeds |= - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G; - if (params->adv_speeds & 0) + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G; + if (params->adv_speeds & QED_LM_40000baseLR4_Full_BIT) link_params->speed.advertised_speeds |= - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G; - if (params->adv_speeds & 0) + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G; + if (params->adv_speeds & QED_LM_50000baseKR2_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G; + if (params->adv_speeds & QED_LM_100000baseKR4_Full_BIT) link_params->speed.advertised_speeds |= NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G; } @@ -1168,50 +1171,56 @@ static void qed_fill_link(struct qed_hwfn *hwfn, if_link->link_up = true; /* TODO - at the moment assume supported and advertised speed equal */ - if_link->supported_caps = SUPPORTED_FIBRE; + if_link->supported_caps = QED_LM_FIBRE_BIT; if (params.speed.autoneg) - if_link->supported_caps |= SUPPORTED_Autoneg; + if_link->supported_caps |= QED_LM_Autoneg_BIT; if (params.pause.autoneg || (params.pause.forced_rx && params.pause.forced_tx)) - if_link->supported_caps |= SUPPORTED_Asym_Pause; + if_link->supported_caps |= QED_LM_Asym_Pause_BIT; if (params.pause.autoneg || params.pause.forced_rx || params.pause.forced_tx) - if_link->supported_caps |= SUPPORTED_Pause; + if_link->supported_caps |= QED_LM_Pause_BIT; if_link->advertised_caps = if_link->supported_caps; if (params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) - if_link->advertised_caps |= SUPPORTED_1000baseT_Half | - SUPPORTED_1000baseT_Full; + if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT | + QED_LM_1000baseT_Full_BIT; if (params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) - if_link->advertised_caps |= SUPPORTED_10000baseKR_Full; + if_link->advertised_caps |= QED_LM_10000baseKR_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) + if_link->advertised_caps |= QED_LM_25000baseKR_Full_BIT; if (params.speed.advertised_speeds & - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) - if_link->advertised_caps |= SUPPORTED_40000baseLR4_Full; + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) + if_link->advertised_caps |= QED_LM_40000baseLR4_Full_BIT; if (params.speed.advertised_speeds & - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) - if_link->advertised_caps |= 0; + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) + if_link->advertised_caps |= QED_LM_50000baseKR2_Full_BIT; if (params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) - if_link->advertised_caps |= 0; + if_link->advertised_caps |= QED_LM_100000baseKR4_Full_BIT; if (link_caps.speed_capabilities & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) - if_link->supported_caps |= SUPPORTED_1000baseT_Half | - SUPPORTED_1000baseT_Full; + if_link->supported_caps |= QED_LM_1000baseT_Half_BIT | + QED_LM_1000baseT_Full_BIT; if (link_caps.speed_capabilities & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) - if_link->supported_caps |= SUPPORTED_10000baseKR_Full; + if_link->supported_caps |= QED_LM_10000baseKR_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) + if_link->supported_caps |= QED_LM_25000baseKR_Full_BIT; if (link_caps.speed_capabilities & - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) - if_link->supported_caps |= SUPPORTED_40000baseLR4_Full; + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) + if_link->supported_caps |= QED_LM_40000baseLR4_Full_BIT; if (link_caps.speed_capabilities & - NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) - if_link->supported_caps |= 0; + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) + if_link->supported_caps |= QED_LM_50000baseKR2_Full_BIT; if (link_caps.speed_capabilities & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) - if_link->supported_caps |= 0; + if_link->supported_caps |= QED_LM_100000baseKR4_Full_BIT; if (link.link_up) if_link->speed = link.speed; @@ -1231,33 +1240,29 @@ static void qed_fill_link(struct qed_hwfn *hwfn, if_link->pause_config |= QED_LINK_PAUSE_TX_ENABLE; /* Link partner capabilities */ - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_1G_HD) - if_link->lp_caps |= SUPPORTED_1000baseT_Half; - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_1G_FD) - if_link->lp_caps |= SUPPORTED_1000baseT_Full; - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_10G) - if_link->lp_caps |= SUPPORTED_10000baseKR_Full; - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_40G) - if_link->lp_caps |= SUPPORTED_40000baseLR4_Full; - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_50G) - if_link->lp_caps |= 0; - if (link.partner_adv_speed & - QED_LINK_PARTNER_SPEED_100G) - if_link->lp_caps |= 0; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_HD) + if_link->lp_caps |= QED_LM_1000baseT_Half_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_FD) + if_link->lp_caps |= QED_LM_1000baseT_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_10G) + if_link->lp_caps |= QED_LM_10000baseKR_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_25G) + if_link->lp_caps |= QED_LM_25000baseKR_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_40G) + if_link->lp_caps |= QED_LM_40000baseLR4_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_50G) + if_link->lp_caps |= QED_LM_50000baseKR2_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_100G) + if_link->lp_caps |= QED_LM_100000baseKR4_Full_BIT; if (link.an_complete) - if_link->lp_caps |= SUPPORTED_Autoneg; + if_link->lp_caps |= QED_LM_Autoneg_BIT; if (link.partner_adv_pause) - if_link->lp_caps |= SUPPORTED_Pause; + if_link->lp_caps |= QED_LM_Pause_BIT; if (link.partner_adv_pause == QED_LINK_PARTNER_ASYMMETRIC_PAUSE || link.partner_adv_pause == QED_LINK_PARTNER_BOTH_PAUSE) - if_link->lp_caps |= SUPPORTED_Asym_Pause; + if_link->lp_caps |= QED_LM_Asym_Pause_BIT; } static void qed_get_current_link(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index a240f26344a4..ce4b08a5fe99 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -634,6 +634,9 @@ static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, p_link->partner_adv_speed |= (status & LINK_STATUS_LINK_PARTNER_20G_CAPABLE) ? QED_LINK_PARTNER_SPEED_20G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_25G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_25G : 0; p_link->partner_adv_speed |= (status & LINK_STATUS_LINK_PARTNER_40G_CAPABLE) ? QED_LINK_PARTNER_SPEED_40G : 0; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 7f319aa1b229..013d1b92ed63 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -60,9 +60,10 @@ struct qed_mcp_link_state { #define QED_LINK_PARTNER_SPEED_1G_FD BIT(1) #define QED_LINK_PARTNER_SPEED_10G BIT(2) #define QED_LINK_PARTNER_SPEED_20G BIT(3) -#define QED_LINK_PARTNER_SPEED_40G BIT(4) -#define QED_LINK_PARTNER_SPEED_50G BIT(5) -#define QED_LINK_PARTNER_SPEED_100G BIT(6) +#define QED_LINK_PARTNER_SPEED_25G BIT(4) +#define QED_LINK_PARTNER_SPEED_40G BIT(5) +#define QED_LINK_PARTNER_SPEED_50G BIT(6) +#define QED_LINK_PARTNER_SPEED_100G BIT(7) u32 partner_adv_speed; bool partner_tx_flow_ctrl_en; diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index f8492cac9290..0e0acfb5c1ed 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -249,78 +249,150 @@ static u32 qede_get_priv_flags(struct net_device *dev) return (!!(edev->dev_info.common.num_hwfns > 1)) << QEDE_PRI_FLAG_CMT; } -static int qede_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +struct qede_link_mode_mapping { + u32 qed_link_mode; + u32 ethtool_link_mode; +}; + +static const struct qede_link_mode_mapping qed_lm_map[] = { + {QED_LM_FIBRE_BIT, ETHTOOL_LINK_MODE_FIBRE_BIT}, + {QED_LM_Autoneg_BIT, ETHTOOL_LINK_MODE_Autoneg_BIT}, + {QED_LM_Asym_Pause_BIT, ETHTOOL_LINK_MODE_Asym_Pause_BIT}, + {QED_LM_Pause_BIT, ETHTOOL_LINK_MODE_Pause_BIT}, + {QED_LM_1000baseT_Half_BIT, ETHTOOL_LINK_MODE_1000baseT_Half_BIT}, + {QED_LM_1000baseT_Full_BIT, ETHTOOL_LINK_MODE_1000baseT_Full_BIT}, + {QED_LM_10000baseKR_Full_BIT, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT}, + {QED_LM_25000baseKR_Full_BIT, ETHTOOL_LINK_MODE_25000baseKR_Full_BIT}, + {QED_LM_40000baseLR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT}, + {QED_LM_50000baseKR2_Full_BIT, ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT}, + {QED_LM_100000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT}, +}; + +#define QEDE_DRV_TO_ETHTOOL_CAPS(caps, lk_ksettings, name) \ +{ \ + int i; \ + \ + for (i = 0; i < QED_LM_COUNT; i++) { \ + if ((caps) & (qed_lm_map[i].qed_link_mode)) \ + __set_bit(qed_lm_map[i].ethtool_link_mode,\ + lk_ksettings->link_modes.name); \ + } \ +} + +#define QEDE_ETHTOOL_TO_DRV_CAPS(caps, lk_ksettings, name) \ +{ \ + int i; \ + \ + for (i = 0; i < QED_LM_COUNT; i++) { \ + if (test_bit(qed_lm_map[i].ethtool_link_mode, \ + lk_ksettings->link_modes.name)) \ + caps |= qed_lm_map[i].qed_link_mode; \ + } \ +} + +static int qede_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) { + struct ethtool_link_settings *base = &cmd->base; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; memset(¤t_link, 0, sizeof(current_link)); edev->ops->common->get_link(edev->cdev, ¤t_link); - cmd->supported = current_link.supported_caps; - cmd->advertising = current_link.advertised_caps; + ethtool_link_ksettings_zero_link_mode(cmd, supported); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.supported_caps, cmd, supported) + + ethtool_link_ksettings_zero_link_mode(cmd, advertising); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.advertised_caps, cmd, advertising) + + ethtool_link_ksettings_zero_link_mode(cmd, lp_advertising); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.lp_caps, cmd, lp_advertising) + if ((edev->state == QEDE_STATE_OPEN) && (current_link.link_up)) { - ethtool_cmd_speed_set(cmd, current_link.speed); - cmd->duplex = current_link.duplex; + base->speed = current_link.speed; + base->duplex = current_link.duplex; } else { - cmd->duplex = DUPLEX_UNKNOWN; - ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN); + base->speed = SPEED_UNKNOWN; + base->duplex = DUPLEX_UNKNOWN; } - cmd->port = current_link.port; - cmd->autoneg = (current_link.autoneg) ? AUTONEG_ENABLE : - AUTONEG_DISABLE; - cmd->lp_advertising = current_link.lp_caps; + base->port = current_link.port; + base->autoneg = (current_link.autoneg) ? AUTONEG_ENABLE : + AUTONEG_DISABLE; return 0; } -static int qede_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +static int qede_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) { + const struct ethtool_link_settings *base = &cmd->base; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; struct qed_link_params params; - u32 speed; if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) { - DP_INFO(edev, - "Link settings are not allowed to be changed\n"); + DP_INFO(edev, "Link settings are not allowed to be changed\n"); return -EOPNOTSUPP; } - memset(¤t_link, 0, sizeof(current_link)); memset(¶ms, 0, sizeof(params)); edev->ops->common->get_link(edev->cdev, ¤t_link); - speed = ethtool_cmd_speed(cmd); params.override_flags |= QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS; params.override_flags |= QED_LINK_OVERRIDE_SPEED_AUTONEG; - if (cmd->autoneg == AUTONEG_ENABLE) { + if (base->autoneg == AUTONEG_ENABLE) { params.autoneg = true; params.forced_speed = 0; - params.adv_speeds = cmd->advertising; - } else { /* forced speed */ + QEDE_ETHTOOL_TO_DRV_CAPS(params.adv_speeds, cmd, advertising) + } else { /* forced speed */ params.override_flags |= QED_LINK_OVERRIDE_SPEED_FORCED_SPEED; params.autoneg = false; - params.forced_speed = speed; - switch (speed) { + params.forced_speed = base->speed; + switch (base->speed) { case SPEED_10000: if (!(current_link.supported_caps & - SUPPORTED_10000baseKR_Full)) { + QED_LM_10000baseKR_Full_BIT)) { DP_INFO(edev, "10G speed not supported\n"); return -EINVAL; } - params.adv_speeds = SUPPORTED_10000baseKR_Full; + params.adv_speeds = QED_LM_10000baseKR_Full_BIT; + break; + case SPEED_25000: + if (!(current_link.supported_caps & + QED_LM_25000baseKR_Full_BIT)) { + DP_INFO(edev, "25G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_25000baseKR_Full_BIT; break; case SPEED_40000: if (!(current_link.supported_caps & - SUPPORTED_40000baseLR4_Full)) { + QED_LM_40000baseLR4_Full_BIT)) { DP_INFO(edev, "40G speed not supported\n"); return -EINVAL; } - params.adv_speeds = SUPPORTED_40000baseLR4_Full; + params.adv_speeds = QED_LM_40000baseLR4_Full_BIT; + break; + case 0xdead: + if (!(current_link.supported_caps & + QED_LM_50000baseKR2_Full_BIT)) { + DP_INFO(edev, "50G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_50000baseKR2_Full_BIT; + break; + case 0xbeef: + if (!(current_link.supported_caps & + QED_LM_100000baseKR4_Full_BIT)) { + DP_INFO(edev, "100G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_100000baseKR4_Full_BIT; break; default: - DP_INFO(edev, "Unsupported speed %u\n", speed); + DP_INFO(edev, "Unsupported speed %u\n", base->speed); return -EINVAL; } } @@ -1228,8 +1300,8 @@ static int qede_get_tunable(struct net_device *dev, } static const struct ethtool_ops qede_ethtool_ops = { - .get_settings = qede_get_settings, - .set_settings = qede_set_settings, + .get_link_ksettings = qede_get_link_ksettings, + .set_link_ksettings = qede_set_link_ksettings, .get_drvinfo = qede_get_drvinfo, .get_msglevel = qede_get_msglevel, .set_msglevel = qede_set_msglevel, @@ -1260,7 +1332,7 @@ static const struct ethtool_ops qede_ethtool_ops = { }; static const struct ethtool_ops qede_vf_ethtool_ops = { - .get_settings = qede_get_settings, + .get_link_ksettings = qede_get_link_ksettings, .get_drvinfo = qede_get_drvinfo, .get_msglevel = qede_get_msglevel, .set_msglevel = qede_set_msglevel, diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b1e3c57c7117..737fc4c8db49 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -268,6 +268,21 @@ enum qed_protocol { QED_PROTOCOL_ISCSI, }; +enum qed_link_mode_bits { + QED_LM_FIBRE_BIT = BIT(0), + QED_LM_Autoneg_BIT = BIT(1), + QED_LM_Asym_Pause_BIT = BIT(2), + QED_LM_Pause_BIT = BIT(3), + QED_LM_1000baseT_Half_BIT = BIT(4), + QED_LM_1000baseT_Full_BIT = BIT(5), + QED_LM_10000baseKR_Full_BIT = BIT(6), + QED_LM_25000baseKR_Full_BIT = BIT(7), + QED_LM_40000baseLR4_Full_BIT = BIT(8), + QED_LM_50000baseKR2_Full_BIT = BIT(9), + QED_LM_100000baseKR4_Full_BIT = BIT(10), + QED_LM_COUNT = 11 +}; + struct qed_link_params { bool link_up; -- cgit v1.2.3 From a7c44247f704e385c77579d65c6ee6d002832529 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:56 +0200 Subject: xfrm: policy: make xfrm_policy_lookup_bytype lockless side effect: no longer disables BH (should be fine). Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 1ab51d188408..3ab828a97e68 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -11,7 +11,7 @@ struct ctl_table_header; struct xfrm_policy_hash { - struct hlist_head *table; + struct hlist_head __rcu *table; unsigned int hmask; u8 dbits4; u8 sbits4; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 09f2e2b38246..9302647f20a0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1123,7 +1123,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - read_lock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&xfrm_policy_hash_generation); @@ -1172,7 +1172,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 9d0380df6217e8dd014118fa1c99dda9974f3613 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:59 +0200 Subject: xfrm: policy: convert policy_lock to spinlock After earlier patches conversions all spots acquire the writer lock and we can now convert this to a normal spinlock. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 68 ++++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 3ab828a97e68..177ed444d7b2 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -73,7 +73,7 @@ struct netns_xfrm { struct dst_ops xfrm6_dst_ops; #endif spinlock_t xfrm_state_lock; - rwlock_t xfrm_policy_lock; + spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; /* flow cache part */ diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 35b85a9a358c..dd01fd2e55fa 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -484,7 +484,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) if (!ndst) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, @@ -500,7 +500,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&xfrm_policy_hash_generation); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -519,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total) if (!nidx) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); @@ -527,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total) net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } @@ -617,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { @@ -659,7 +659,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head(&policy->bydst, chain); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } @@ -770,7 +770,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct hlist_head *chain; struct hlist_node *newpos; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; @@ -781,7 +781,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; @@ -817,7 +817,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); @@ -837,7 +837,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, struct hlist_head *chain; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -850,7 +850,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -859,7 +859,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -878,7 +878,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, return NULL; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { @@ -889,7 +889,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -898,7 +898,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -956,7 +956,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) @@ -972,14 +972,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } @@ -991,13 +991,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again2; } } @@ -1006,7 +1006,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (!cnt) err = -ESRCH; out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -1026,7 +1026,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else @@ -1054,7 +1054,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, } list_del_init(&walk->walk.all); out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); @@ -1073,9 +1073,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) if (list_empty(&walk->walk.all)) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); @@ -1321,9 +1321,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; @@ -1342,7 +1342,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) return -EINVAL; #endif - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { @@ -1360,7 +1360,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) */ xfrm_sk_policy_unlink(old_pol, dir); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); @@ -1390,9 +1390,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->type = old->type; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; @@ -3074,7 +3074,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); + spin_lock_init(&net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); return 0; @@ -3206,7 +3206,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * struct hlist_head *chain; u32 priority = ~0U; - read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && @@ -3230,7 +3230,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * xfrm_pol_hold(ret); - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } -- cgit v1.2.3 From cb1b69b0b15b2897daeba8674c14c85a23a3347f Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Thu, 11 Aug 2016 18:02:07 +0200 Subject: netfilter: nf_tables: add hash expression This patch adds a new hash expression, this provides jhash support but this can be extended to support for other hash functions. The modulus and seed already comes embedded into this new expression. Use case example: ... meta mark set hash ip saddr mod 10 Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 20 +++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_hash.c | 136 +++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+) create mode 100644 net/netfilter/nft_hash.c (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 01751faccaf8..6ce0a6dd0889 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -723,6 +723,26 @@ enum nft_meta_keys { NFT_META_PRANDOM, }; +/** + * enum nft_hash_attributes - nf_tables hash expression netlink attributes + * + * @NFTA_HASH_SREG: source register (NLA_U32) + * @NFTA_HASH_DREG: destination register (NLA_U32) + * @NFTA_HASH_LEN: source data length (NLA_U32) + * @NFTA_HASH_MODULUS: modulus value (NLA_U32) + * @NFTA_HASH_SEED: seed value (NLA_U32) + */ +enum nft_hash_attributes { + NFTA_HASH_UNSPEC, + NFTA_HASH_SREG, + NFTA_HASH_DREG, + NFTA_HASH_LEN, + NFTA_HASH_MODULUS, + NFTA_HASH_SEED, + __NFTA_HASH_MAX, +}; +#define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) + /** * enum nft_meta_attributes - nf_tables meta expression netlink attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e5740e108a0b..9cfaa00c79b2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -563,6 +563,12 @@ config NFT_COMPAT x_tables match/target extensions over the nf_tables framework. +config NFT_HASH + tristate "Netfilter nf_tables hash module" + help + This option adds the "hash" expression that you can use to perform + a hash operation on registers. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 101fb859203c..1106ccde215c 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o +obj-$(CONFIG_NFT_HASH) += nft_hash.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c new file mode 100644 index 000000000000..b82ff29b3f5f --- /dev/null +++ b/net/netfilter/nft_hash.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016 Laura Garcia + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_hash { + enum nft_registers sreg:8; + enum nft_registers dreg:8; + u8 len; + u32 modulus; + u32 seed; +}; + +static void nft_hash_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_hash *priv = nft_expr_priv(expr); + const void *data = ®s->data[priv->sreg]; + + regs->data[priv->dreg] = + reciprocal_scale(jhash(data, priv->len, priv->seed), + priv->modulus); +} + +const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { + [NFTA_HASH_SREG] = { .type = NLA_U32 }, + [NFTA_HASH_DREG] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, + [NFTA_HASH_SEED] = { .type = NLA_U32 }, +}; + +static int nft_hash_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_hash *priv = nft_expr_priv(expr); + u32 len; + + if (!tb[NFTA_HASH_SREG] || + !tb[NFTA_HASH_DREG] || + !tb[NFTA_HASH_LEN] || + !tb[NFTA_HASH_SEED] || + !tb[NFTA_HASH_MODULUS]) + return -EINVAL; + + priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); + priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); + + len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN])); + if (len == 0 || len > U8_MAX) + return -ERANGE; + + priv->len = len; + + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); + if (priv->modulus <= 1) + return -ERANGE; + + priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); + + return nft_validate_register_load(priv->sreg, len) && + nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_hash_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_hash *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_HASH_LEN, priv->len)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_HASH_MODULUS, priv->modulus)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_HASH_SEED, priv->seed)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_hash_type; +static const struct nft_expr_ops nft_hash_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_hash)), + .eval = nft_hash_eval, + .init = nft_hash_init, + .dump = nft_hash_dump, +}; + +static struct nft_expr_type nft_hash_type __read_mostly = { + .name = "hash", + .ops = &nft_hash_ops, + .policy = nft_hash_policy, + .maxattr = NFTA_HASH_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ + return nft_register_expr(&nft_hash_type); +} + +static void __exit nft_hash_module_exit(void) +{ + nft_unregister_expr(&nft_hash_type); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia "); +MODULE_ALIAS_NFT_EXPR("hash"); -- cgit v1.2.3 From 300d8b93d73533411a98362350fe7d1795f1a044 Mon Sep 17 00:00:00 2001 From: Appana Durga Kedareswara Rao Date: Wed, 10 Aug 2016 11:20:06 +0530 Subject: net: Add mask for Control register 10Mbps speed This patch adds mask for the Control register 10Mbps speed. Reviewed-by: Florian Fainelli Signed-off-by: Kedareswara rao Appana Signed-off-by: David S. Miller --- include/uapi/linux/mii.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 237fac4bc17b..15d8510cdae0 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -48,6 +48,7 @@ #define BMCR_SPEED100 0x2000 /* Select 100Mbps */ #define BMCR_LOOPBACK 0x4000 /* TXD loopback bits */ #define BMCR_RESET 0x8000 /* Reset to default state */ +#define BMCR_SPEED10 0x0000 /* Select 10Mbps */ /* Basic mode status register. */ #define BMSR_ERCAP 0x0001 /* Ext-reg capability */ -- cgit v1.2.3 From aed704b7a634954dc28fe5c4b49db478cf2d96b7 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Fri, 12 Aug 2016 08:56:40 -0700 Subject: cgroup: Add task_under_cgroup_hierarchy cgroup inline function to headers This commit adds an inline function to cgroup.h to check whether a given task is under a given cgroup hierarchy. This is to avoid having to put ifdefs in .c files to gate access to cgroups. When cgroups are disabled this always returns true. Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 984f73b719a9..a4414a11eea7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -497,6 +497,23 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry + * @task: the task to be tested + * @ancestor: possible ancestor of @task's cgroup + * + * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. + * It follows all the same rules as cgroup_is_descendant, and only applies + * to the default hierarchy. + */ +static inline bool task_under_cgroup_hierarchy(struct task_struct *task, + struct cgroup *ancestor) +{ + struct css_set *cset = task_css_set(task); + + return cgroup_is_descendant(cset->dfl_cgrp, ancestor); +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { @@ -557,6 +574,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; +struct cgroup; static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -574,6 +592,11 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } +static inline bool task_under_cgroup_hierarchy(struct task_struct *task, + struct cgroup *ancestor) +{ + return true; +} #endif /* !CONFIG_CGROUPS */ /* -- cgit v1.2.3 From 60d20f9195b260bdf0ac10c275ae9f6016f9c069 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Fri, 12 Aug 2016 08:56:52 -0700 Subject: bpf: Add bpf_current_task_under_cgroup helper This adds a bpf helper that's similar to the skb_in_cgroup helper to check whether the probe is currently executing in the context of a specific subset of the cgroupsv2 hierarchy. It does this based on membership test for a cgroup arraymap. It is invalid to call this in an interrupt, and it'll return an error. The helper is primarily to be used in debugging activities for containers, where you may have multiple programs running in a given top-level "container". Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/arraymap.c | 2 +- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da218fec6056..bea0c4e2830a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -375,6 +375,17 @@ enum bpf_func_id { */ BPF_FUNC_probe_write_user, + /** + * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 current failed the cgroup2 descendant test + * == 1 current succeeded the cgroup2 descendant test + * < 0 error + */ + BPF_FUNC_current_task_under_cgroup, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633a650d7aeb..a2ac051c342f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) } late_initcall(register_perf_event_array_map); -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7094c69ac199..d504722ebfa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_in_cgroup) + if (func_id != BPF_FUNC_skb_in_cgroup && + func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; default: @@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_in_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..6b794d6669a7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -376,6 +376,34 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *)(long)r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + u32 idx = (u32)r2; + + if (unlikely(in_interrupt())) + return -EINVAL; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -407,6 +435,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: return bpf_get_probe_write_proto(); + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; default: return NULL; } -- cgit v1.2.3 From adf0516845bcd0e626323c858ece28ee58c74455 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Aug 2016 13:47:06 +0200 Subject: netfilter: remove ip_conntrack* sysctl compat code This backward compatibility has been around for more than ten years, since Yasuyuki Kozakai introduced IPv6 in conntrack. These days, we have alternate /proc/net/nf_conntrack* entries, the ctnetlink interface and the conntrack utility got adopted by many people in the user community according to what I observed on the netfilter user mailing list. So let's get rid of this. Note that nf_conntrack_htable_size and unsigned int nf_conntrack_max do not need to be exported as symbol anymore. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 8 - include/net/netns/conntrack.h | 8 - net/ipv4/netfilter/Kconfig | 11 - net/ipv4/netfilter/Makefile | 5 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 70 --- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 491 --------------------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 39 +- net/netfilter/nf_conntrack_core.c | 3 - net/netfilter/nf_conntrack_proto.c | 81 +--- net/netfilter/nf_conntrack_proto_generic.c | 39 +- net/netfilter/nf_conntrack_proto_sctp.c | 85 +--- net/netfilter/nf_conntrack_proto_tcp.c | 127 +----- net/netfilter/nf_conntrack_proto_udp.c | 49 +- 13 files changed, 7 insertions(+), 1009 deletions(-) delete mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1a5fb36f165f..de629f1520df 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -134,14 +134,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto); -static inline void nf_ct_kfree_compat_sysctl_table(struct nf_proto_net *pn) -{ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - kfree(pn->ctl_compat_table); - pn->ctl_compat_table = NULL; -#endif -} - /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 38b1a80517f0..e469e85de3f9 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -15,10 +15,6 @@ struct nf_proto_net { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_header; struct ctl_table *ctl_table; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - struct ctl_table_header *ctl_compat_header; - struct ctl_table *ctl_compat_table; -#endif #endif unsigned int users; }; @@ -58,10 +54,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - struct ctl_table_header *ctl_table_header; - struct ctl_table *ctl_table; -#endif }; struct ct_pcpu { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c187c60e3e0c..d613309e3e5d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -25,17 +25,6 @@ config NF_CONNTRACK_IPV4 To compile it as a module, choose M here. If unsure, say N. -config NF_CONNTRACK_PROC_COMPAT - bool "proc/sysctl compatibility with old connection tracking" - depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4 - default y - help - This option enables /proc and sysctl compatibility with the old - layer 3 dependent connection tracking. This is needed to keep - old programs that have not been adapted to the new names working. - - If unsure, say Y. - if NF_TABLES config NF_TABLES_IPV4 diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 87b073da14c9..853328f8fd05 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -4,11 +4,6 @@ # objects for l3 independent conntrack nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o -ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) -ifeq ($(CONFIG_PROC_FS),y) -nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o -endif -endif # connection tracking obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index ae1a71a97132..870aebda2932 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -202,47 +202,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { }, }; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; - -static struct ctl_table ip_ct_sysctl_table[] = { - { - .procname = "ip_conntrack_max", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_count", - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_buckets", - .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_checksum", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_log_invalid", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, - }, - { } -}; -#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ - /* Fast function for those who don't want to parse /proc (and I don't blame them). */ /* Reversing the socket's dst/src point of view gives us the reply @@ -350,20 +309,6 @@ static struct nf_sockopt_ops so_getorigdst = { static int ipv4_init_net(struct net *net) { -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - struct nf_ip_net *in = &net->ct.nf_ct_proto; - in->ctl_table = kmemdup(ip_ct_sysctl_table, - sizeof(ip_ct_sysctl_table), - GFP_KERNEL); - if (!in->ctl_table) - return -ENOMEM; - - in->ctl_table[0].data = &nf_conntrack_max; - in->ctl_table[1].data = &net->ct.count; - in->ctl_table[2].data = &nf_conntrack_htable_size; - in->ctl_table[3].data = &net->ct.sysctl_checksum; - in->ctl_table[4].data = &net->ct.sysctl_log_invalid; -#endif return 0; } @@ -379,9 +324,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, -#endif -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - .ctl_table_path = "net/ipv4/netfilter", #endif .init_net = ipv4_init_net, .me = THIS_MODULE, @@ -492,16 +434,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) goto cleanup_icmpv4; } -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - ret = nf_conntrack_ipv4_compat_init(); - if (ret < 0) - goto cleanup_proto; -#endif return ret; -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - cleanup_proto: - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); -#endif cleanup_icmpv4: nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); cleanup_udp4: @@ -520,9 +453,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) static void __exit nf_conntrack_l3proto_ipv4_fini(void) { synchronize_net(); -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - nf_conntrack_ipv4_compat_fini(); -#endif nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c deleted file mode 100644 index 67bfc69e00bc..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ /dev/null @@ -1,491 +0,0 @@ -/* ip_conntrack proc compat - based on ip_conntrack_standalone.c - * - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * (C) 2006-2010 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -struct ct_iter_state { - struct seq_net_private p; - struct hlist_nulls_head *hash; - unsigned int htable_size; - unsigned int bucket; -}; - -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) -{ - struct ct_iter_state *st = seq->private; - struct hlist_nulls_node *n; - - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} - -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; - - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return NULL; - } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} - -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); - - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *ct_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - struct ct_iter_state *st = seq->private; - - rcu_read_lock(); - - nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); -} - -static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return ct_get_next(s, v); -} - -static void ct_seq_stop(struct seq_file *s, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -#ifdef CONFIG_NF_CONNTRACK_SECMARK -static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) -{ - int ret; - u32 len; - char *secctx; - - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) - return; - - seq_printf(s, "secctx=%s ", secctx); - - security_release_secctx(secctx, len); -} -#else -static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) -{ -} -#endif - -static bool ct_seq_should_skip(const struct nf_conn *ct, - const struct net *net, - const struct nf_conntrack_tuple_hash *hash) -{ - /* we only want to print DIR_ORIGINAL */ - if (NF_CT_DIRECTION(hash)) - return true; - - if (nf_ct_l3num(ct) != AF_INET) - return true; - - if (!net_eq(nf_ct_net(ct), net)) - return true; - - return false; -} - -static int ct_seq_show(struct seq_file *s, void *v) -{ - struct nf_conntrack_tuple_hash *hash = v; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); - const struct nf_conntrack_l3proto *l3proto; - const struct nf_conntrack_l4proto *l4proto; - int ret = 0; - - NF_CT_ASSERT(ct); - if (ct_seq_should_skip(ct, seq_file_net(s), hash)) - return 0; - - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) - return 0; - - /* check if we raced w. object reuse */ - if (!nf_ct_is_confirmed(ct) || - ct_seq_should_skip(ct, seq_file_net(s), hash)) - goto release; - - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - NF_CT_ASSERT(l3proto); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - NF_CT_ASSERT(l4proto); - - ret = -ENOSPC; - seq_printf(s, "%-8s %u %ld ", - l4proto->name, nf_ct_protonum(ct), - nf_ct_expires(ct) / HZ); - - if (l4proto->print_conntrack) - l4proto->print_conntrack(s, ct); - - if (seq_has_overflowed(s)) - goto release; - - print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto); - - if (seq_has_overflowed(s)) - goto release; - - if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - goto release; - - if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) - seq_printf(s, "[UNREPLIED] "); - - print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto); - - if (seq_has_overflowed(s)) - goto release; - - if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - goto release; - - if (test_bit(IPS_ASSURED_BIT, &ct->status)) - seq_printf(s, "[ASSURED] "); - -#ifdef CONFIG_NF_CONNTRACK_MARK - seq_printf(s, "mark=%u ", ct->mark); -#endif - - ct_show_secctx(s, ct); - - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); - - if (seq_has_overflowed(s)) - goto release; - - ret = 0; -release: - nf_ct_put(ct); - return ret; -} - -static const struct seq_operations ct_seq_ops = { - .start = ct_seq_start, - .next = ct_seq_next, - .stop = ct_seq_stop, - .show = ct_seq_show -}; - -static int ct_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ct_seq_ops, - sizeof(struct ct_iter_state)); -} - -static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, - .open = ct_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* expects */ -struct ct_expect_iter_state { - struct seq_net_private p; - unsigned int bucket; -}; - -static struct hlist_node *ct_expect_get_first(struct seq_file *seq) -{ - struct ct_expect_iter_state *st = seq->private; - struct hlist_node *n; - - for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference( - hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); - if (n) - return n; - } - return NULL; -} - -static struct hlist_node *ct_expect_get_next(struct seq_file *seq, - struct hlist_node *head) -{ - struct ct_expect_iter_state *st = seq->private; - - head = rcu_dereference(hlist_next_rcu(head)); - while (head == NULL) { - if (++st->bucket >= nf_ct_expect_hsize) - return NULL; - head = rcu_dereference( - hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); - } - return head; -} - -static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_node *head = ct_expect_get_first(seq); - - if (head) - while (pos && (head = ct_expect_get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *exp_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - rcu_read_lock(); - return ct_expect_get_idx(seq, *pos); -} - -static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - return ct_expect_get_next(seq, v); -} - -static void exp_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static int exp_seq_show(struct seq_file *s, void *v) -{ - struct nf_conntrack_expect *exp; - const struct hlist_node *n = v; - - exp = hlist_entry(n, struct nf_conntrack_expect, hnode); - - if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) - return 0; - - if (exp->tuple.src.l3num != AF_INET) - return 0; - - if (exp->timeout.function) - seq_printf(s, "%ld ", timer_pending(&exp->timeout) - ? (long)(exp->timeout.expires - jiffies)/HZ : 0); - else - seq_printf(s, "- "); - - seq_printf(s, "proto=%u ", exp->tuple.dst.protonum); - - print_tuple(s, &exp->tuple, - __nf_ct_l3proto_find(exp->tuple.src.l3num), - __nf_ct_l4proto_find(exp->tuple.src.l3num, - exp->tuple.dst.protonum)); - seq_putc(s, '\n'); - - return 0; -} - -static const struct seq_operations exp_seq_ops = { - .start = exp_seq_start, - .next = exp_seq_next, - .stop = exp_seq_stop, - .show = exp_seq_show -}; - -static int exp_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &exp_seq_ops, - sizeof(struct ct_expect_iter_state)); -} - -static const struct file_operations ip_exp_file_ops = { - .owner = THIS_MODULE, - .open = exp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - int cpu; - - if (*pos == 0) - return SEQ_START_TOKEN; - - for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return per_cpu_ptr(net->ct.stat, cpu); - } - - return NULL; -} - -static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - int cpu; - - for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return per_cpu_ptr(net->ct.stat, cpu); - } - - return NULL; -} - -static void ct_cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int ct_cpu_seq_show(struct seq_file *seq, void *v) -{ - struct net *net = seq_file_net(seq); - unsigned int nr_conntracks = atomic_read(&net->ct.count); - const struct ip_conntrack_stat *st = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); - return 0; - } - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - nr_conntracks, - st->searched, - st->found, - st->new, - st->invalid, - st->ignore, - st->delete, - st->delete_list, - st->insert, - st->insert_failed, - st->drop, - st->early_drop, - st->error, - - st->expect_new, - st->expect_create, - st->expect_delete, - st->search_restart - ); - return 0; -} - -static const struct seq_operations ct_cpu_seq_ops = { - .start = ct_cpu_seq_start, - .next = ct_cpu_seq_next, - .stop = ct_cpu_seq_stop, - .show = ct_cpu_seq_show, -}; - -static int ct_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ct_cpu_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations ct_cpu_seq_fops = { - .owner = THIS_MODULE, - .open = ct_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int __net_init ip_conntrack_net_init(struct net *net) -{ - struct proc_dir_entry *proc, *proc_exp, *proc_stat; - - proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops); - if (!proc) - goto err1; - - proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net, - &ip_exp_file_ops); - if (!proc_exp) - goto err2; - - proc_stat = proc_create("ip_conntrack", S_IRUGO, - net->proc_net_stat, &ct_cpu_seq_fops); - if (!proc_stat) - goto err3; - return 0; - -err3: - remove_proc_entry("ip_conntrack_expect", net->proc_net); -err2: - remove_proc_entry("ip_conntrack", net->proc_net); -err1: - return -ENOMEM; -} - -static void __net_exit ip_conntrack_net_exit(struct net *net) -{ - remove_proc_entry("ip_conntrack", net->proc_net_stat); - remove_proc_entry("ip_conntrack_expect", net->proc_net); - remove_proc_entry("ip_conntrack", net->proc_net); -} - -static struct pernet_operations ip_conntrack_net_ops = { - .init = ip_conntrack_net_init, - .exit = ip_conntrack_net_exit, -}; - -int __init nf_conntrack_ipv4_compat_init(void) -{ - return register_pernet_subsys(&ip_conntrack_net_ops); -} - -void __exit nf_conntrack_ipv4_compat_fini(void) -{ - unregister_pernet_subsys(&ip_conntrack_net_ops); -} diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index c567e1b5d799..4b5904bc2614 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -327,17 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table icmp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -355,40 +344,14 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, - sizeof(icmp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &in->timeout; -#endif -#endif - return 0; -} - static int icmp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_icmp_net *in = icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; - ret = icmp_kmemdup_compat_sysctl_table(pn, in); - if (ret < 0) - return ret; - - ret = icmp_kmemdup_sysctl_table(pn, in); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - - return ret; + return icmp_kmemdup_sysctl_table(pn, in); } static struct nf_proto_net *icmp_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..22558b7ff7cd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -161,10 +161,7 @@ static void nf_conntrack_all_unlock(void) } unsigned int nf_conntrack_htable_size __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); - unsigned int nf_conntrack_max __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b65d5864b6d9..8d2c7d8c666a 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -159,54 +159,6 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -static struct nf_ip_net *nf_ct_l3proto_net(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - if (l3proto->l3proto == PF_INET) - return &net->ct.nf_ct_proto; - else - return NULL; -} - -static int nf_ct_l3proto_register_sysctl(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - int err = 0; - struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); - /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */ - if (in == NULL) - return 0; - -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - if (in->ctl_table != NULL) { - err = nf_ct_register_sysctl(net, - &in->ctl_table_header, - l3proto->ctl_table_path, - in->ctl_table); - if (err < 0) { - kfree(in->ctl_table); - in->ctl_table = NULL; - } - } -#endif - return err; -} - -static void nf_ct_l3proto_unregister_sysctl(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); - - if (in == NULL) - return; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - if (in->ctl_table_header != NULL) - nf_ct_unregister_sysctl(&in->ctl_table_header, - &in->ctl_table, - 0); -#endif -} - int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; @@ -241,7 +193,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); int nf_ct_l3proto_pernet_register(struct net *net, struct nf_conntrack_l3proto *proto) { - int ret = 0; + int ret; if (proto->init_net) { ret = proto->init_net(net); @@ -249,7 +201,7 @@ int nf_ct_l3proto_pernet_register(struct net *net, return ret; } - return nf_ct_l3proto_register_sysctl(net, proto); + return 0; } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); @@ -272,8 +224,6 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); void nf_ct_l3proto_pernet_unregister(struct net *net, struct nf_conntrack_l3proto *proto) { - nf_ct_l3proto_unregister_sysctl(net, proto); - /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } @@ -312,26 +262,6 @@ int nf_ct_l4proto_register_sysctl(struct net *net, } } } -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) { - if (err < 0) { - nf_ct_kfree_compat_sysctl_table(pn); - goto out; - } - err = nf_ct_register_sysctl(net, - &pn->ctl_compat_header, - "net/ipv4/netfilter", - pn->ctl_compat_table); - if (err == 0) - goto out; - - nf_ct_kfree_compat_sysctl_table(pn); - nf_ct_unregister_sysctl(&pn->ctl_table_header, - &pn->ctl_table, - pn->users); - } -out: -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ return err; } @@ -346,13 +276,6 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net, nf_ct_unregister_sysctl(&pn->ctl_table_header, &pn->ctl_table, pn->users); - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL) - nf_ct_unregister_sysctl(&pn->ctl_compat_header, - &pn->ctl_compat_table, - 0); -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ } diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 86dc752e5349..d5868bad33a7 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -151,17 +151,6 @@ static struct ctl_table generic_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table generic_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_generic_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -179,40 +168,14 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_generic_net *gn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table, - sizeof(generic_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &gn->timeout; -#endif -#endif - return 0; -} - static int generic_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_generic_net *gn = generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; - ret = generic_kmemdup_compat_sysctl_table(pn, gn); - if (ret < 0) - return ret; - - ret = generic_kmemdup_sysctl_table(pn, gn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - - return ret; + return generic_kmemdup_sysctl_table(pn, gn); } static struct nf_proto_net *generic_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index e769f0561621..982ea62606c7 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -705,54 +705,6 @@ static struct ctl_table sctp_sysctl_table[] = { }, { } }; - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table sctp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_sctp_timeout_closed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -781,32 +733,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct sctp_net *sn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table, - sizeof(sctp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; - pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; - pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; - pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; - pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; - pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; - pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; -#endif -#endif - return 0; -} - static int sctp_init_net(struct net *net, u_int16_t proto) { - int ret; struct sctp_net *sn = sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; @@ -817,18 +745,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto) sn->timeouts[i] = sctp_timeouts[i]; } - if (proto == AF_INET) { - ret = sctp_kmemdup_compat_sysctl_table(pn, sn); - if (ret < 0) - return ret; - - ret = sctp_kmemdup_sysctl_table(pn, sn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = sctp_kmemdup_sysctl_table(pn, sn); - - return ret; + return sctp_kmemdup_sysctl_table(pn, sn); } static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 4abe9e1f8909..69f687740c76 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1481,90 +1481,6 @@ static struct ctl_table tcp_sysctl_table[] = { }, { } }; - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table tcp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_tcp_timeout_syn_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_syn_sent2", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_syn_recv", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_fin_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_close_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_last_ack", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_time_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_close", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_loose", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_tcp_be_liberal", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -1597,38 +1513,8 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_tcp_net *tn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, - sizeof(tcp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; - pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2]; - pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; - pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; - pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; - pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; - pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; - pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; - pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; - pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; - pn->ctl_compat_table[10].data = &tn->tcp_loose; - pn->ctl_compat_table[11].data = &tn->tcp_be_liberal; - pn->ctl_compat_table[12].data = &tn->tcp_max_retrans; -#endif -#endif - return 0; -} - static int tcp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_tcp_net *tn = tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; @@ -1643,18 +1529,7 @@ static int tcp_init_net(struct net *net, u_int16_t proto) tn->tcp_max_retrans = nf_ct_tcp_max_retrans; } - if (proto == AF_INET) { - ret = tcp_kmemdup_compat_sysctl_table(pn, tn); - if (ret < 0) - return ret; - - ret = tcp_kmemdup_sysctl_table(pn, tn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = tcp_kmemdup_sysctl_table(pn, tn); - - return ret; + return tcp_kmemdup_sysctl_table(pn, tn); } static struct nf_proto_net *tcp_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8a057e1e1247..20f35ed68030 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -218,23 +218,6 @@ static struct ctl_table udp_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table udp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_udp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_udp_timeout_stream", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -254,27 +237,8 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_udp_net *un) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table, - sizeof(udp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; - pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED]; -#endif -#endif - return 0; -} - static int udp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_udp_net *un = udp_pernet(net); struct nf_proto_net *pn = &un->pn; @@ -285,18 +249,7 @@ static int udp_init_net(struct net *net, u_int16_t proto) un->timeouts[i] = udp_timeouts[i]; } - if (proto == AF_INET) { - ret = udp_kmemdup_compat_sysctl_table(pn, un); - if (ret < 0) - return ret; - - ret = udp_kmemdup_sysctl_table(pn, un); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = udp_kmemdup_sysctl_table(pn, un); - - return ret; + return udp_kmemdup_sysctl_table(pn, un); } static struct nf_proto_net *udp_get_net_proto(struct net *net) -- cgit v1.2.3 From 04ed5ad5db6880d53dd1bb8c93e82228a462a4dd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 01:28:47 +0300 Subject: net/mlx5: Init/Teardown hca commands via mlx5 ifc Remove old representation of manually created Init/Teardown hca commands layout and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 36 ++++++++-------------------- include/linux/mlx5/device.h | 24 ------------------- 2 files changed, 10 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 77fc1aa26114..56bf520d1429 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -162,38 +162,22 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) { - struct mlx5_cmd_init_hca_mbox_in in; - struct mlx5_cmd_init_hca_mbox_out out; + u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_INIT_HCA); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - - return err; + MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) { - struct mlx5_cmd_teardown_hca_mbox_in in; - struct mlx5_cmd_teardown_hca_mbox_out out; + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_TEARDOWN_HCA); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - - return err; + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b6d15cddb2f..6c343c0b77d2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -455,30 +455,6 @@ struct mlx5_odp_caps { char reserved2[0xe4]; }; -struct mlx5_cmd_init_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 profile; - u8 rsvd1[4]; -}; - -struct mlx5_cmd_init_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_teardown_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 profile; - u8 rsvd1[4]; -}; - -struct mlx5_cmd_teardown_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - struct mlx5_cmd_layout { u8 type; u8 rsvd0[3]; -- cgit v1.2.3 From 20ed51c643b6296789a48adc3bc2cc875a1612cf Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 00:46:41 +0300 Subject: net/mlx5: Access register and MAD IFC commands via mlx5 ifc Remove old representation of manually created ACCESS_REG/MAD_IFC commands layout and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/mad.c | 42 ++++++++++----------- drivers/net/ethernet/mellanox/mlx5/core/port.c | 52 +++++++++++++------------- include/linux/mlx5/device.h | 29 -------------- 3 files changed, 45 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c index 1368dac00da0..13e6afd52a9b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mad.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c @@ -39,36 +39,34 @@ int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, u16 opmod, u8 port) { - struct mlx5_mad_ifc_mbox_in *in = NULL; - struct mlx5_mad_ifc_mbox_out *out = NULL; - int err; + int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); + int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in); + int err = -ENOMEM; + void *data; + void *resp; + u32 *out; + u32 *in; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) - return -ENOMEM; - - out = kzalloc(sizeof(*out), GFP_KERNEL); - if (!out) { - err = -ENOMEM; + in = kzalloc(inlen, GFP_KERNEL); + out = kzalloc(outlen, GFP_KERNEL); + if (!in || !out) goto out; - } - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MAD_IFC); - in->hdr.opmod = cpu_to_be16(opmod); - in->port = port; + MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); + MLX5_SET(mad_ifc_in, in, op_mod, opmod); + MLX5_SET(mad_ifc_in, in, port, port); - memcpy(in->data, inb, sizeof(in->data)); + data = MLX5_ADDR_OF(mad_ifc_in, in, mad); + memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); - err = mlx5_cmd_exec(dev, in, sizeof(*in), out, sizeof(*out)); + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) goto out; - if (out->hdr.status) { - err = mlx5_cmd_status_to_err(&out->hdr); - goto out; - } - - memcpy(outb, out->data, sizeof(out->data)); + resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet); + memcpy(outb, resp, + MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet)); out: kfree(out); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 752c08127138..e8324c2a8fc3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -38,45 +38,43 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, - u16 reg_num, int arg, int write) + u16 reg_id, int arg, int write) { - struct mlx5_access_reg_mbox_in *in = NULL; - struct mlx5_access_reg_mbox_out *out = NULL; + int outlen = MLX5_ST_SZ_BYTES(access_register_out) + size_out; + int inlen = MLX5_ST_SZ_BYTES(access_register_in) + size_in; int err = -ENOMEM; + u32 *out = NULL; + u32 *in = NULL; + void *data; - in = mlx5_vzalloc(sizeof(*in) + size_in); - if (!in) - return -ENOMEM; - - out = mlx5_vzalloc(sizeof(*out) + size_out); - if (!out) - goto ex1; - - memcpy(in->data, data_in, size_in); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ACCESS_REG); - in->hdr.opmod = cpu_to_be16(!write); - in->arg = cpu_to_be32(arg); - in->register_id = cpu_to_be16(reg_num); - err = mlx5_cmd_exec(dev, in, sizeof(*in) + size_in, out, - sizeof(*out) + size_out); - if (err) - goto ex2; + in = mlx5_vzalloc(inlen); + out = mlx5_vzalloc(outlen); + if (!in || !out) + goto out; - if (out->hdr.status) - err = mlx5_cmd_status_to_err(&out->hdr); + data = MLX5_ADDR_OF(access_register_in, in, register_data); + memcpy(data, data_in, size_in); - if (!err) - memcpy(data_out, out->data, size_out); + MLX5_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG); + MLX5_SET(access_register_in, in, op_mod, !write); + MLX5_SET(access_register_in, in, argument, arg); + MLX5_SET(access_register_in, in, register_id, reg_id); + + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + err = err ? : mlx5_cmd_status_to_err_v2(out); + if (err) + goto out; + + data = MLX5_ADDR_OF(access_register_out, out, register_data); + memcpy(data_out, data, size_out); -ex2: +out: kvfree(out); -ex1: kvfree(in); return err; } EXPORT_SYMBOL_GPL(mlx5_core_access_reg); - struct mlx5_reg_pcap { u8 rsvd0; u8 port_num; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6c343c0b77d2..9570c493b50f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1165,35 +1165,6 @@ struct mlx5_dump_mkey_mbox_out { __be32 mkey; }; -struct mlx5_mad_ifc_mbox_in { - struct mlx5_inbox_hdr hdr; - __be16 remote_lid; - u8 rsvd0; - u8 port; - u8 rsvd1[4]; - u8 data[256]; -}; - -struct mlx5_mad_ifc_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - u8 data[256]; -}; - -struct mlx5_access_reg_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 register_id; - __be32 arg; - __be32 data[0]; -}; - -struct mlx5_access_reg_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - __be32 data[0]; -}; - #define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { -- cgit v1.2.3 From 20bb566bda7b3e62b67dbb1bd363be40b5ae81c3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 02:01:45 +0300 Subject: net/mlx5: MCG commands via mlx5 ifc Remove old representation of manually created MCG commands layout and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/mcg.c | 70 +++++++-------------------- include/linux/mlx5/mlx5_ifc.h | 2 +- 3 files changed, 21 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index d6e2a1cae19a..0d55e0f33f53 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -280,7 +280,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_DEALLOC_Q_COUNTER: case MLX5_CMD_OP_DEALLOC_PD: case MLX5_CMD_OP_DEALLOC_UAR: - case MLX5_CMD_OP_DETTACH_FROM_MCG: + case MLX5_CMD_OP_DETACH_FROM_MCG: case MLX5_CMD_OP_DEALLOC_XRCD: case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN: case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT: @@ -490,7 +490,7 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(CONFIG_INT_MODERATION); MLX5_COMMAND_STR_CASE(ACCESS_REG); MLX5_COMMAND_STR_CASE(ATTACH_TO_MCG); - MLX5_COMMAND_STR_CASE(DETTACH_FROM_MCG); + MLX5_COMMAND_STR_CASE(DETACH_FROM_MCG); MLX5_COMMAND_STR_CASE(GET_DROPPED_PACKET_LOG); MLX5_COMMAND_STR_CASE(MAD_IFC); MLX5_COMMAND_STR_CASE(QUERY_MAD_DEMUX); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c index d5a0c2d61a18..01a1abd88203 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -37,70 +37,36 @@ #include #include "mlx5_core.h" -struct mlx5_attach_mcg_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - __be32 rsvd; - u8 gid[16]; -}; - -struct mlx5_attach_mcg_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvf[8]; -}; - -struct mlx5_detach_mcg_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - __be32 rsvd; - u8 gid[16]; -}; - -struct mlx5_detach_mcg_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvf[8]; -}; - int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) { - struct mlx5_attach_mcg_mbox_in in; - struct mlx5_attach_mcg_mbox_out out; + u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {0}; + void *gid; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ATTACH_TO_MCG); - memcpy(in.gid, mgid, sizeof(*mgid)); - in.qpn = cpu_to_be32(qpn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_attach_mcg); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) { - struct mlx5_detach_mcg_mbox_in in; - struct mlx5_detach_mcg_mbox_out out; + u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {0}; + void *gid; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETTACH_FROM_MCG); - memcpy(in.gid, mgid, sizeof(*mgid)); - in.qpn = cpu_to_be32(qpn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_detach_mcg); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 21bc4557b67a..3f70fc9c2fc9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -152,7 +152,7 @@ enum { MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, MLX5_CMD_OP_ACCESS_REG = 0x805, MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, - MLX5_CMD_OP_DETTACH_FROM_MCG = 0x807, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, MLX5_CMD_OP_MAD_IFC = 0x50d, MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, -- cgit v1.2.3 From 73b626c182dff06867ceba996a819e8372c9b2ce Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 03:26:15 +0300 Subject: net/mlx5: EQ commands via mlx5 ifc Remove old representation of manually created EQ commands layout, and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 18 +++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 78 ++++++++++------------- include/linux/mlx5/device.h | 74 --------------------- include/linux/mlx5/driver.h | 2 +- 4 files changed, 44 insertions(+), 128 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 5210d92e6bc7..58e5518ebb27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -358,32 +358,32 @@ out: static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, int index) { - struct mlx5_query_eq_mbox_out *out; - struct mlx5_eq_context *ctx; + int outlen = MLX5_ST_SZ_BYTES(query_eq_out); u64 param = 0; + void *ctx; + u32 *out; int err; - out = kzalloc(sizeof(*out), GFP_KERNEL); + out = kzalloc(outlen, GFP_KERNEL); if (!out) return param; - ctx = &out->ctx; - - err = mlx5_core_eq_query(dev, eq, out, sizeof(*out)); + err = mlx5_core_eq_query(dev, eq, out, outlen); if (err) { mlx5_core_warn(dev, "failed to query eq\n"); goto out; } + ctx = MLX5_ADDR_OF(query_eq_out, out, eq_context_entry); switch (index) { case EQ_NUM_EQES: - param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f); + param = 1 << MLX5_GET(eqc, ctx, log_eq_size); break; case EQ_INTR: - param = ctx->intr; + param = MLX5_GET(eqc, ctx, intr); break; case EQ_LOG_PG_SZ: - param = (ctx->log_page_size & 0x1f) + 12; + param = MLX5_GET(eqc, ctx, log_page_size) + 12; break; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 0e30602ef76d..71411976ef1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -86,23 +86,16 @@ struct cre_des_eq { static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) { - struct mlx5_destroy_eq_mbox_in in; - struct mlx5_destroy_eq_mbox_out out; + u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_EQ); - in.eqn = eqn; - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (!err) - goto ex; + MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); + MLX5_SET(destroy_eq_in, in, eq_number, eqn); - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); -ex: - return err; } static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) @@ -351,11 +344,13 @@ static void init_eq_buf(struct mlx5_eq *eq) int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, struct mlx5_uar *uar) { + u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0}; struct mlx5_priv *priv = &dev->priv; - struct mlx5_create_eq_mbox_in *in; - struct mlx5_create_eq_mbox_out out; - int err; + __be64 *pas; + void *eqc; int inlen; + u32 *in; + int err; eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE); eq->cons_index = 0; @@ -365,35 +360,37 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, init_eq_buf(eq); - inlen = sizeof(*in) + sizeof(in->pas[0]) * eq->buf.npages; + inlen = MLX5_ST_SZ_BYTES(create_eq_in) + + MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages; + in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err_buf; } - memset(&out, 0, sizeof(out)); - mlx5_fill_page_array(&eq->buf, in->pas); + pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas); + mlx5_fill_page_array(&eq->buf, pas); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); - in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); - in->ctx.intr = vecidx; - in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; - in->events_mask = cpu_to_be64(mask); + MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); + MLX5_SET64(create_eq_in, in, event_bitmask, mask); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); - if (err) - goto err_in; + eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); + MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent)); + MLX5_SET(eqc, eqc, uar_page, uar->index); + MLX5_SET(eqc, eqc, intr, vecidx); + MLX5_SET(eqc, eqc, log_page_size, + eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - if (out.hdr.status) { - err = mlx5_cmd_status_to_err(&out.hdr); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); + if (err) goto err_in; - } snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s", name, pci_name(dev->pdev)); - eq->eqn = out.eq_number; + eq->eqn = MLX5_GET(create_eq_out, out, eq_number); eq->irqn = priv->msix_arr[vecidx].vector; eq->dev = dev; eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET; @@ -547,22 +544,15 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev) } int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, - struct mlx5_query_eq_mbox_out *out, int outlen) + u32 *out, int outlen) { - struct mlx5_query_eq_mbox_in in; + u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(out, 0, outlen); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_EQ); - in.eqn = eq->eqn; - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); - if (err) - return err; - - if (out->hdr.status) - err = mlx5_cmd_status_to_err(&out->hdr); + MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); + MLX5_SET(query_eq_in, in, eq_number, eq->eqn); - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL_GPL(mlx5_core_eq_query); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9570c493b50f..c84e0ba5b261 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -995,80 +995,6 @@ struct mlx5_disable_hca_mbox_out { u8 rsvd[8]; }; -struct mlx5_eq_context { - u8 status; - u8 ec_oi; - u8 st; - u8 rsvd2[7]; - __be16 page_pffset; - __be32 log_sz_usr_page; - u8 rsvd3[7]; - u8 intr; - u8 log_page_size; - u8 rsvd4[15]; - __be32 consumer_counter; - __be32 produser_counter; - u8 rsvd5[16]; -}; - -struct mlx5_create_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 input_eqn; - u8 rsvd1[4]; - struct mlx5_eq_context ctx; - u8 rsvd2[8]; - __be64 events_mask; - u8 rsvd3[176]; - __be64 pas[0]; -}; - -struct mlx5_create_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[3]; - u8 eq_number; - u8 rsvd1[4]; -}; - -struct mlx5_destroy_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 eqn; - u8 rsvd1[4]; -}; - -struct mlx5_destroy_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_map_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be64 mask; - u8 mu; - u8 rsvd0[2]; - u8 eqn; - u8 rsvd1[24]; -}; - -struct mlx5_map_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 eqn; - u8 rsvd1[4]; -}; - -struct mlx5_query_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - struct mlx5_eq_context ctx; -}; - enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ccea6fb16482..eed4b612572d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -865,7 +865,7 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, - struct mlx5_query_eq_mbox_out *out, int outlen); + u32 *out, int outlen); int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 278277866334e515141dde7c8ac143e15c0a767f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 02:33:22 +0300 Subject: {net,IB}/mlx5: CQ commands via mlx5 ifc Remove old representation of manually created CQ commands layout, and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cq.c | 110 ++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 113 +++++++++------------- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 18 ++-- include/linux/mlx5/cq.h | 6 +- include/linux/mlx5/device.h | 76 --------------- 5 files changed, 122 insertions(+), 201 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 308a358e5b46..35a9f718e669 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -747,14 +747,16 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, - int entries, struct mlx5_create_cq_mbox_in **cqb, + int entries, u32 **cqb, int *cqe_size, int *index, int *inlen) { struct mlx5_ib_create_cq ucmd; size_t ucmdlen; int page_shift; + __be64 *pas; int npages; int ncont; + void *cqc; int err; ucmdlen = @@ -792,14 +794,20 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); - *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont; *cqb = mlx5_vzalloc(*inlen); if (!*cqb) { err = -ENOMEM; goto err_db; } - mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); - (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); *index = to_mucontext(context)->uuari.uars[0].index; @@ -834,9 +842,10 @@ static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, int entries, int cqe_size, - struct mlx5_create_cq_mbox_in **cqb, - int *index, int *inlen) + u32 **cqb, int *index, int *inlen) { + __be64 *pas; + void *cqc; int err; err = mlx5_db_alloc(dev->mdev, &cq->db); @@ -853,15 +862,21 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, init_cq_buf(cq, &cq->buf); - *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * cq->buf.buf.npages; *cqb = mlx5_vzalloc(*inlen); if (!*cqb) { err = -ENOMEM; goto err_buf; } - mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_fill_page_array(&cq->buf.buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + *index = dev->mdev->priv.uuari.uars[0].index; return 0; @@ -895,11 +910,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, { int entries = attr->cqe; int vector = attr->comp_vector; - struct mlx5_create_cq_mbox_in *cqb = NULL; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_cq *cq; int uninitialized_var(index); int uninitialized_var(inlen); + u32 *cqb = NULL; + void *cqc; int cqe_size; unsigned int irqn; int eqn; @@ -945,19 +961,20 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } - cq->cqe_size = cqe_size; - cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; - - if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) - cqb->ctx.cqe_sz_flags |= (1 << 1); - - cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); if (err) goto err_cqb; - cqb->ctx.c_eqn = cpu_to_be16(eqn); - cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + cq->cqe_size = cqe_size; + + cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); + MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + MLX5_SET(cqc, cqc, uar_page, index); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + MLX5_SET(cqc, cqc, oi, 1); err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); if (err) @@ -1088,27 +1105,15 @@ void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) { - struct mlx5_modify_cq_mbox_in *in; struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); int err; - u32 fsel; if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) return -ENOSYS; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) - return -ENOMEM; - - in->cqn = cpu_to_be32(mcq->mcq.cqn); - fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); - in->ctx.cq_period = cpu_to_be16(cq_period); - in->ctx.cq_max_count = cpu_to_be16(cq_count); - in->field_select = cpu_to_be32(fsel); - err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); - kfree(in); - + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, + cq_period, cq_count); if (err) mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); @@ -1241,9 +1246,11 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibcq->device); struct mlx5_ib_cq *cq = to_mcq(ibcq); - struct mlx5_modify_cq_mbox_in *in; + void *cqc; + u32 *in; int err; int npas; + __be64 *pas; int page_shift; int inlen; int uninitialized_var(cqe_size); @@ -1285,28 +1292,37 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) if (err) goto ex; - inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + inlen = MLX5_ST_SZ_BYTES(modify_cq_in) + + MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas; + in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto ex_resize; } + pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas); if (udata) mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, - in->pas, 0); + pas, 0); else - mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); - - in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | - MLX5_MODIFY_CQ_MASK_PG_OFFSET | - MLX5_MODIFY_CQ_MASK_PG_SIZE); - in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; - in->ctx.page_offset = 0; - in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); - in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); - in->cqn = cpu_to_be32(cq->mcq.cqn); + mlx5_fill_page_array(&cq->resize_buf->buf, pas); + + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.resize_field_select.resize_field_select, + MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + + MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); + MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn); err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 873a631ad155..cf02d8a27874 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -134,33 +134,30 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) complete(&cq->free); } - int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_create_cq_mbox_in *in, int inlen) + u32 *in, int inlen) { - int err; struct mlx5_cq_table *table = &dev->priv.cq_table; - struct mlx5_create_cq_mbox_out out; - struct mlx5_destroy_cq_mbox_in din; - struct mlx5_destroy_cq_mbox_out dout; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + u32 din[MLX5_ST_SZ_DW(destroy_cq_in)]; + u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)]; int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn); struct mlx5_eq *eq; + int err; eq = mlx5_eqn2eq(dev, eqn); if (IS_ERR(eq)) return PTR_ERR(eq); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ); - memset(&out, 0, sizeof(out)); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + memset(out, 0, sizeof(out)); + MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - - cq->cqn = be32_to_cpu(out.cqn) & 0xffffff; + cq->cqn = MLX5_GET(create_cq_out, out, cqn); cq->cons_index = 0; cq->arm_sn = 0; atomic_set(&cq->refcount, 1); @@ -186,19 +183,21 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, return 0; err_cmd: - memset(&din, 0, sizeof(din)); - memset(&dout, 0, sizeof(dout)); - din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ); - mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout)); - return err; + memset(din, 0, sizeof(din)); + memset(dout, 0, sizeof(dout)); + MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); + err = mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + return err ? : mlx5_cmd_status_to_err_v2(out); + } EXPORT_SYMBOL(mlx5_core_create_cq); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) { struct mlx5_cq_table *table = &dev->priv.cq_table; - struct mlx5_destroy_cq_mbox_in in; - struct mlx5_destroy_cq_mbox_out out; + u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0}; struct mlx5_core_cq *tmp; int err; @@ -214,17 +213,13 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) return -EINVAL; } - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ); - in.cqn = cpu_to_be32(cq->cqn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - synchronize_irq(cq->irqn); mlx5_debug_cq_remove(dev, cq); @@ -237,44 +232,28 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) EXPORT_SYMBOL(mlx5_core_destroy_cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_query_cq_mbox_out *out) + u32 *out, int outlen) { - struct mlx5_query_cq_mbox_in in; + u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(out, 0, sizeof(*out)); - - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_CQ); - in.cqn = cpu_to_be32(cq->cqn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); - if (err) - return err; - - if (out->hdr.status) - return mlx5_cmd_status_to_err(&out->hdr); + MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); + MLX5_SET(query_cq_in, in, cqn, cq->cqn); - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_query_cq); - int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_modify_cq_mbox_in *in, int in_sz) + u32 *in, int inlen) { - struct mlx5_modify_cq_mbox_out out; + u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0}; int err; - memset(&out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MODIFY_CQ); - err = mlx5_cmd_exec(dev, in, in_sz, &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - - return 0; + MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_modify_cq); @@ -283,18 +262,20 @@ int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, u16 cq_period, u16 cq_max_count) { - struct mlx5_modify_cq_mbox_in in; - - memset(&in, 0, sizeof(in)); - - in.cqn = cpu_to_be32(cq->cqn); - in.ctx.cq_period = cpu_to_be16(cq_period); - in.ctx.cq_max_count = cpu_to_be16(cq_max_count); - in.field_select = cpu_to_be32(MLX5_CQ_MODIFY_PERIOD | - MLX5_CQ_MODIFY_COUNT); - - return mlx5_core_modify_cq(dev, cq, &in, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {0}; + void *cqc; + + MLX5_SET(modify_cq_in, in, cqn, cq->cqn); + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, cq_period, cq_period); + MLX5_SET(cqc, cqc, cq_max_count, cq_max_count); + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.modify_field_select.modify_field_select, + MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + + return mlx5_core_modify_cq(dev, cq, in, sizeof(in)); } +EXPORT_SYMBOL(mlx5_core_modify_cq_moderation); int mlx5_init_cq_table(struct mlx5_core_dev *dev) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 58e5518ebb27..b7484e4128c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -395,37 +395,37 @@ out: static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, int index) { - struct mlx5_query_cq_mbox_out *out; - struct mlx5_cq_context *ctx; + int outlen = MLX5_ST_SZ_BYTES(query_cq_out); u64 param = 0; + void *ctx; + u32 *out; int err; - out = kzalloc(sizeof(*out), GFP_KERNEL); + out = mlx5_vzalloc(outlen); if (!out) return param; - ctx = &out->ctx; - - err = mlx5_core_query_cq(dev, cq, out); + err = mlx5_core_query_cq(dev, cq, out, outlen); if (err) { mlx5_core_warn(dev, "failed to query cq\n"); goto out; } + ctx = MLX5_ADDR_OF(query_cq_out, out, cq_context); switch (index) { case CQ_PID: param = cq->pid; break; case CQ_NUM_CQES: - param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f); + param = 1 << MLX5_GET(cqc, ctx, log_cq_size); break; case CQ_LOG_PG_SZ: - param = (ctx->log_pg_sz & 0x1f) + 12; + param = MLX5_GET(cqc, ctx, log_page_size); break; } out: - kfree(out); + kvfree(out); return param; } diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 2566f6d6444f..7c3c0d3aca37 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -170,12 +170,12 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, int mlx5_init_cq_table(struct mlx5_core_dev *dev); void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_create_cq_mbox_in *in, int inlen); + u32 *in, int inlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_query_cq_mbox_out *out); + u32 *out, int outlen); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_modify_cq_mbox_in *in, int in_sz); + u32 *in, int inlen); int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u16 cq_period, u16 cq_max_count); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index c84e0ba5b261..5a1c1606bdbd 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -899,82 +899,6 @@ struct mlx5_arm_srq_mbox_out { u8 rsvd[8]; }; -struct mlx5_cq_context { - u8 status; - u8 cqe_sz_flags; - u8 st; - u8 rsvd3; - u8 rsvd4[6]; - __be16 page_offset; - __be32 log_sz_usr_page; - __be16 cq_period; - __be16 cq_max_count; - __be16 rsvd20; - __be16 c_eqn; - u8 log_pg_sz; - u8 rsvd25[7]; - __be32 last_notified_index; - __be32 solicit_producer_index; - __be32 consumer_counter; - __be32 producer_counter; - u8 rsvd48[8]; - __be64 db_record_addr; -}; - -struct mlx5_create_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_cqn; - u8 rsvdx[4]; - struct mlx5_cq_context ctx; - u8 rsvd6[192]; - __be64 pas[0]; -}; - -struct mlx5_create_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - -struct mlx5_query_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_query_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_cq_context ctx; - u8 rsvd6[16]; - __be64 pas[0]; -}; - -struct mlx5_modify_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - __be32 field_select; - struct mlx5_cq_context ctx; - u8 rsvd[192]; - __be64 pas[0]; -}; - -struct mlx5_modify_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - struct mlx5_enable_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd[8]; -- cgit v1.2.3 From ec22eb53106be1472ba6573dc900943f52f8fd1e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 06:28:36 +0300 Subject: {net,IB}/mlx5: MKey/PSV commands via mlx5 ifc Remove old representation of manually created MKey/PSV commands layout, and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 184 ++++++++++++--------- drivers/infiniband/hw/mlx5/qp.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 +- .../net/ethernet/mellanox/mlx5/core/en_common.c | 23 +-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 37 ++--- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 183 +++++++++----------- include/linux/mlx5/device.h | 113 +------------ include/linux/mlx5/driver.h | 11 +- include/linux/mlx5/mlx5_ifc.h | 2 +- 10 files changed, 235 insertions(+), 332 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 372385d0f993..a59034aaa297 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -504,7 +504,7 @@ struct mlx5_ib_mr { int umred; int npages; struct mlx5_ib_dev *dev; - struct mlx5_create_mkey_mbox_out out; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4b021305c321..6f7e34753abc 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -135,20 +135,10 @@ static void reg_mr_callback(int status, void *context) return; } - if (mr->out.hdr.status) { - mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", - mr->out.hdr.status, - be32_to_cpu(mr->out.hdr.syndrome)); - kfree(mr); - dev->fill_delay = 1; - mod_timer(&dev->delay_timer, jiffies + HZ); - return; - } - spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); - mr->mmkey.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key; cache->last_add = jiffies; @@ -170,16 +160,19 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_create_mkey_mbox_in *in; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; int npages = 1 << ent->order; + void *mkc; + u32 *in; int err = 0; int i; - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { if (ent->pending >= MAX_PENDING_REG_MR) { err = -EAGAIN; @@ -194,18 +187,22 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) mr->order = ent->order; mr->umred = 1; mr->dev = dev; - in->seg.status = MLX5_MKEY_STATUS_FREE; - in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; - in->seg.log2_page_size = 12; + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2); + MLX5_SET(mkc, mkc, log_page_size, 12); spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, - sizeof(*in), reg_mr_callback, - mr, &mr->out); + err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey, + in, inlen, + mr->out, sizeof(mr->out), + reg_mr_callback, mr); if (err) { spin_lock_irq(&ent->lock); ent->pending--; @@ -670,30 +667,38 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_mkey_seg *seg; struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } - seg = &in->seg; - seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; - seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - seg->start_addr = 0; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); - err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, - NULL); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, 0); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); if (err) goto err_in; @@ -1063,9 +1068,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, int page_shift, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in; struct mlx5_ib_mr *mr; + __be64 *pas; + void *mkc; int inlen; + u32 *in; int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); @@ -1073,31 +1080,41 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, if (!mr) return ERR_PTR(-ENOMEM); - inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(*pas) * ((npages + 1) / 2) * 2; in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err_1; } - mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, pg_cap ? MLX5_IB_MTT_PRESENT : 0); - /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + /* The pg_access bit allows setting the access flags * in the page list submitted with the command. */ - in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; - in->seg.flags = convert_access(access_flags) | - MLX5_ACCESS_MODE_MTT; - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); - in->seg.start_addr = cpu_to_be64(virt_addr); - in->seg.len = cpu_to_be64(length); - in->seg.bsfs_octo_size = 0; - in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - in->seg.log2_page_size = page_shift; - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, - 1 << page_shift)); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen, NULL, - NULL, NULL); + MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET64(mkc, mkc, start_addr, virt_addr); + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(virt_addr, length, 1 << page_shift)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(virt_addr, length, 1 << page_shift)); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; @@ -1523,30 +1540,32 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, u32 max_num_sg) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_ib_mr *mr; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } - in->seg.status = MLX5_MKEY_STATUS_FREE; - in->seg.xlt_oct_size = cpu_to_be32(ndescs); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); if (mr_type == IB_MR_TYPE_MEM_REG) { - mr->access_mode = MLX5_ACCESS_MODE_MTT; - in->seg.log2_page_size = PAGE_SHIFT; - + mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(u64)); if (err) @@ -1555,7 +1574,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->desc_size = sizeof(u64); mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SG_GAPS) { - mr->access_mode = MLX5_ACCESS_MODE_KLM; + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(struct mlx5_klm)); @@ -1566,9 +1585,8 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, } else if (mr_type == IB_MR_TYPE_SIGNATURE) { u32 psv_index[2]; - in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | - MLX5_MKEY_BSF_EN); - in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); if (!mr->sig) { err = -ENOMEM; @@ -1581,7 +1599,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_free_sig; - mr->access_mode = MLX5_ACCESS_MODE_KLM; + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; mr->sig->psv_memory.psv_idx = psv_index[0]; mr->sig->psv_wire.psv_idx = psv_index[1]; @@ -1595,9 +1613,10 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; } - in->seg.flags = MLX5_PERM_UMR_EN | mr->access_mode; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), - NULL, NULL, NULL); + MLX5_SET(mkc, mkc, access_mode, mr->access_mode); + MLX5_SET(mkc, mkc, umr_en, 1); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) goto err_destroy_psv; @@ -1633,8 +1652,10 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in = NULL; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mw *mw = NULL; + u32 *in = NULL; + void *mkc; int ndescs; int err; struct mlx5_ib_alloc_mw req = {}; @@ -1658,23 +1679,24 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); mw = kzalloc(sizeof(*mw), GFP_KERNEL); - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!mw || !in) { err = -ENOMEM; goto free; } - in->seg.status = MLX5_MKEY_STATUS_FREE; - in->seg.xlt_oct_size = cpu_to_be32(ndescs); - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); - in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_KLM | - MLX5_PERM_LOCAL_READ; - if (type == IB_MW_TYPE_2) - in->seg.flags_pd |= cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - - err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, sizeof(*in), - NULL, NULL, NULL); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen); if (err) goto free; @@ -1811,7 +1833,7 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); - if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); else n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 0dd7d93cac95..21ab0e26fa71 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2968,7 +2968,7 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, memset(umr, 0, sizeof(*umr)); - if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) /* KLMs take twice the size of MTTs */ ndescs *= 2; @@ -3111,9 +3111,9 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, memset(seg, 0, sizeof(*seg)); - if (mr->access_mode == MLX5_ACCESS_MODE_MTT) + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) seg->log2_page_size = ilog2(mr->ibmr.page_size); - else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) /* KLMs take twice the size of MTTs */ ndescs *= 2; @@ -3454,7 +3454,7 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, memset(seg, 0, sizeof(*seg)); seg->flags = get_umr_flags(wr->access_flags) | - MLX5_ACCESS_MODE_KLM; + MLX5_MKC_ACCESS_MODE_KLMS; seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 0d55e0f33f53..88b05400fbc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1301,10 +1301,12 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) callback = ent->callback; context = ent->context; err = ent->ret; - if (!err) + if (!err) { err = mlx5_copy_from_msg(ent->uout, ent->out, ent->uout_size); + err = err ? err : mlx5_cmd_status_to_err_v2(ent->uout); + } mlx5_free_cmd_msg(dev, ent->out); free_msg(dev, ent->in); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 673043ccd76c..3247c3c543a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -60,24 +60,27 @@ void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey) { - struct mlx5_create_mkey_mbox_in *in; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; int err; - in = mlx5_vzalloc(sizeof(*in)); + in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; - in->seg.flags = MLX5_PERM_LOCAL_WRITE | - MLX5_PERM_LOCAL_READ | - MLX5_ACCESS_MODE_PA; - in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); - err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL, - NULL); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); - kvfree(in); + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + kvfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 870bea37c57c..98fb0d8ef2d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3228,35 +3228,34 @@ static void mlx5e_destroy_q_counter(struct mlx5e_priv *priv) static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_mkey_seg *mkc; - int inlen = sizeof(*in); - u64 npages = - priv->profile->max_nch(mdev) * MLX5_CHANNEL_MAX_NUM_MTTS; + u64 npages = priv->profile->max_nch(mdev) * MLX5_CHANNEL_MAX_NUM_MTTS; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; int err; in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; - mkc = &in->seg; - mkc->status = MLX5_MKEY_STATUS_FREE; - mkc->flags = MLX5_PERM_UMR_EN | - MLX5_PERM_LOCAL_READ | - MLX5_PERM_LOCAL_WRITE | - MLX5_ACCESS_MODE_MTT; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - mkc->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - mkc->flags_pd = cpu_to_be32(mdev->mlx5e_res.pdn); - mkc->len = cpu_to_be64(npages << PAGE_SHIFT); - mkc->xlt_oct_size = cpu_to_be32(mlx5e_get_mtt_octw(npages)); - mkc->log2_page_size = PAGE_SHIFT; + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); - err = mlx5_core_create_mkey(mdev, &priv->umr_mkey, in, inlen, NULL, - NULL, NULL); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn); + MLX5_SET64(mkc, mkc, len, npages << PAGE_SHIFT); + MLX5_SET(mkc, mkc, translations_octword_size, + mlx5e_get_mtt_octw(npages)); + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - kvfree(in); + err = mlx5_core_create_mkey(mdev, &priv->umr_mkey, in, inlen); + kvfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 77a7293921d5..0a7b743caafe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -49,48 +49,44 @@ void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) { } -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey, - struct mlx5_create_mkey_mbox_in *in, int inlen, - mlx5_cmd_cbk_t callback, void *context, - struct mlx5_create_mkey_mbox_out *out) +int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen, + u32 *out, int outlen, + mlx5_cmd_cbk_t callback, void *context) { struct mlx5_mkey_table *table = &dev->priv.mkey_table; - struct mlx5_create_mkey_mbox_out lout; + u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; + u32 mkey_index; + void *mkc; int err; u8 key; - memset(&lout, 0, sizeof(lout)); spin_lock_irq(&dev->priv.mkey_lock); key = dev->priv.mkey_key++; spin_unlock_irq(&dev->priv.mkey_lock); - in->seg.qpn_mkey7_0 |= cpu_to_be32(key); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY); - if (callback) { - err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out), - callback, context); - return err; - } else { - err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout)); - } + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - if (err) { - mlx5_core_dbg(dev, "cmd exec failed %d\n", err); - return err; - } + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + MLX5_SET(mkc, mkc, mkey_7_0, key); - if (lout.hdr.status) { - mlx5_core_dbg(dev, "status %d\n", lout.hdr.status); - return mlx5_cmd_status_to_err(&lout.hdr); - } + if (callback) + return mlx5_cmd_exec_cb(dev, in, inlen, out, outlen, + callback, context); + + err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout)); + err = err ? : mlx5_cmd_status_to_err_v2(lout); + if (err) + return err; - mkey->iova = be64_to_cpu(in->seg.start_addr); - mkey->size = be64_to_cpu(in->seg.len); - mkey->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; - mkey->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); + mkey->iova = MLX5_GET64(mkc, mkc, start_addr); + mkey->size = MLX5_GET64(mkc, mkc, len); + mkey->key = mlx5_idx_to_mkey(mkey_index) | key; + mkey->pd = MLX5_GET(mkc, mkc, pd); mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", - be32_to_cpu(lout.mkey), key, mkey->key); + mkey_index, key, mkey->key); /* connect to mkey tree */ write_lock_irq(&table->lock); @@ -104,21 +100,27 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, return err; } +EXPORT_SYMBOL(mlx5_core_create_mkey_cb); + +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen) +{ + return mlx5_core_create_mkey_cb(dev, mkey, in, inlen, + NULL, 0, NULL, NULL); +} EXPORT_SYMBOL(mlx5_core_create_mkey); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey) { struct mlx5_mkey_table *table = &dev->priv.mkey_table; - struct mlx5_destroy_mkey_mbox_in in; - struct mlx5_destroy_mkey_mbox_out out; + u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; struct mlx5_core_mkey *deleted_mkey; unsigned long flags; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - write_lock_irqsave(&table->lock, flags); deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); write_unlock_irqrestore(&table->lock, flags); @@ -128,94 +130,81 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, return -ENOENT; } - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_destroy_mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - struct mlx5_query_mkey_mbox_out *out, int outlen) + u32 *out, int outlen) { - struct mlx5_query_mkey_mbox_in in; + u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); memset(out, 0, outlen); + MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY); + MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); - if (err) - return err; - - if (out->hdr.status) - return mlx5_cmd_status_to_err(&out->hdr); - - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_query_mkey); int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey) { - struct mlx5_query_special_ctxs_mbox_in in; - struct mlx5_query_special_ctxs_mbox_out out; + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - - *mkey = be32_to_cpu(out.dump_fill_mkey); - + *mkey = MLX5_GET(query_special_contexts_out, out, dump_fill_mkey); return err; } EXPORT_SYMBOL(mlx5_core_dump_fill_mkey); +static inline u32 mlx5_get_psv(u32 *out, int psv_index) +{ + switch (psv_index) { + case 1: return MLX5_GET(create_psv_out, out, psv1_index); + case 2: return MLX5_GET(create_psv_out, out, psv2_index); + case 3: return MLX5_GET(create_psv_out, out, psv3_index); + default: return MLX5_GET(create_psv_out, out, psv0_index); + } +} + int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index) { - struct mlx5_allocate_psv_in in; - struct mlx5_allocate_psv_out out; + u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_psv_in)] = {0}; int i, err; if (npsvs > MLX5_MAX_PSVS) return -EINVAL; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); + MLX5_SET(create_psv_in, in, opcode, MLX5_CMD_OP_CREATE_PSV); + MLX5_SET(create_psv_in, in, pd, pdn); + MLX5_SET(create_psv_in, in, num_psv, npsvs); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_PSV); - in.npsv_pd = cpu_to_be32((npsvs << 28) | pdn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) { - mlx5_core_err(dev, "cmd exec failed %d\n", err); + mlx5_core_err(dev, "create_psv cmd exec failed %d\n", err); return err; } - if (out.hdr.status) { - mlx5_core_err(dev, "create_psv bad status %d\n", - out.hdr.status); - return mlx5_cmd_status_to_err(&out.hdr); - } - for (i = 0; i < npsvs; i++) - sig_index[i] = be32_to_cpu(out.psv_idx[i]) & 0xffffff; + sig_index[i] = mlx5_get_psv(out, i); return err; } @@ -223,29 +212,13 @@ EXPORT_SYMBOL(mlx5_core_create_psv); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) { - struct mlx5_destroy_psv_in in; - struct mlx5_destroy_psv_out out; + u32 out[MLX5_ST_SZ_DW(destroy_psv_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - - in.psv_number = cpu_to_be32(psv_num); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_PSV); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) { - mlx5_core_err(dev, "destroy_psv cmd exec failed %d\n", err); - goto out; - } - - if (out.hdr.status) { - mlx5_core_err(dev, "destroy_psv bad status %d\n", - out.hdr.status); - err = mlx5_cmd_status_to_err(&out.hdr); - goto out; - } - -out: - return err; + MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV); + MLX5_SET(destroy_psv_in, in, psvn, psv_num); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL(mlx5_core_destroy_psv); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5a1c1606bdbd..fb002db1e2f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -197,19 +197,6 @@ enum { MLX5_PCIE_CTRL_TPH_MASK = 3 << 4, }; -enum { - MLX5_ACCESS_MODE_PA = 0, - MLX5_ACCESS_MODE_MTT = 1, - MLX5_ACCESS_MODE_KLM = 2 -}; - -enum { - MLX5_MKEY_REMOTE_INVAL = 1 << 24, - MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, - MLX5_MKEY_BSF_EN = 1 << 30, - MLX5_MKEY_LEN64 = 1 << 31, -}; - enum { MLX5_EN_RD = (u64)1, MLX5_EN_WR = (u64)2 @@ -923,6 +910,13 @@ enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; +enum { + MLX5_MKEY_REMOTE_INVAL = 1 << 24, + MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, + MLX5_MKEY_BSF_EN = 1 << 30, + MLX5_MKEY_LEN64 = 1 << 31, +}; + struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. * bit 31 is always 0, @@ -945,105 +939,12 @@ struct mlx5_mkey_seg { u8 rsvd4[4]; }; -struct mlx5_query_special_ctxs_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_special_ctxs_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 dump_fill_mkey; - __be32 reserved_lkey; -}; - -struct mlx5_create_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_mkey_index; - __be32 flags; - struct mlx5_mkey_seg seg; - u8 rsvd1[16]; - __be32 xlat_oct_act_size; - __be32 rsvd2; - u8 rsvd3[168]; - __be64 pas[0]; -}; - -struct mlx5_create_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 mkey; - u8 rsvd[4]; -}; - -struct mlx5_destroy_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; - u8 rsvd[4]; -}; - -struct mlx5_destroy_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; -}; - -struct mlx5_query_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be64 pas[0]; -}; - -struct mlx5_modify_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; - __be64 pas[0]; -}; - -struct mlx5_modify_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_dump_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; -}; - -struct mlx5_dump_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 mkey; -}; - #define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO = 1 << 0 }; -struct mlx5_allocate_psv_in { - struct mlx5_inbox_hdr hdr; - __be32 npsv_pd; - __be32 rsvd_psv0; -}; - -struct mlx5_allocate_psv_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - __be32 psv_idx[4]; -}; - -struct mlx5_destroy_psv_in { - struct mlx5_inbox_hdr hdr; - __be32 psv_number; - u8 rsvd[4]; -}; - -struct mlx5_destroy_psv_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - enum { VPORT_STATE_DOWN = 0x0, VPORT_STATE_UP = 0x1, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index eed4b612572d..173817187abb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -807,15 +807,18 @@ int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); void mlx5_init_mkey_table(struct mlx5_core_dev *dev); void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen, + u32 *out, int outlen, + mlx5_cmd_cbk_t callback, void *context); int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - struct mlx5_create_mkey_mbox_in *in, int inlen, - mlx5_cmd_cbk_t callback, void *context, - struct mlx5_create_mkey_mbox_out *out); + u32 *in, int inlen); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - struct mlx5_query_mkey_mbox_out *out, int outlen); + u32 *out, int outlen); int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3f70fc9c2fc9..2a39a06dbad4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3489,7 +3489,7 @@ struct mlx5_ifc_query_special_contexts_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x20]; + u8 dump_fill_mkey[0x20]; u8 resd_lkey[0x20]; }; -- cgit v1.2.3 From e79c6a4fc923eed2bdd3b716e0f01414847db90a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 10 Aug 2016 14:36:02 -0700 Subject: net: make net namespace sysctls belong to container's owner If net namespace is attached to a user namespace let's make container's root owner of sysctls affecting said network namespace instead of global root. This also allows us to clean up net_ctl_permissions() because we do not need to fudge permissions anymore for the container's owner since it now owns the objects in question. Acked-by: "Eric W. Biederman" Signed-off-by: Dmitry Torokhov Signed-off-by: David S. Miller --- fs/proc/proc_sysctl.c | 5 +++++ include/linux/sysctl.h | 4 ++++ net/sysctl_net.c | 29 ++++++++++++++++++++--------- 3 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b59db94d2ff4..62d8c6975d34 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { + struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; @@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (is_empty_dir(head)) make_empty_dir_inode(inode); } + + if (root->set_ownership) + root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + out: return inode; } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 697e160c78d0..d82cb6011e77 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include /* For the /proc/sys support */ @@ -157,6 +158,9 @@ struct ctl_table_root { struct ctl_table_set default_set; struct ctl_table_set *(*lookup)(struct ctl_table_root *root, struct nsproxy *namespaces); + void (*set_ownership)(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid); int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 46a71c701e7c..5bc1a3d57401 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); - kuid_t root_uid = make_kuid(net->user_ns, 0); - kgid_t root_gid = make_kgid(net->user_ns, 0); /* Allow network administrator to have same access as root. */ - if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || - uid_eq(root_uid, current_euid())) { + if (ns_capable(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } - /* Allow netns root group to have the same access as the root group */ - if (in_egroup_p(root_gid)) { - int mode = (table->mode >> 3) & 7; - return (mode << 3) | mode; - } + return table->mode; } +static void net_ctl_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct net *net = container_of(head->set, struct net, sysctls); + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ns_root_uid = make_kuid(net->user_ns, 0); + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + ns_root_gid = make_kgid(net->user_ns, 0); + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; +} + static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, + .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) -- cgit v1.2.3 From 03459345bc00da70a35fa39bcfcf13d779097074 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Sat, 13 Aug 2016 00:30:48 +0800 Subject: pptp: Refactor the struct and macros of PPTP codes 1. Use struct gre_base_hdr directly in pptp_gre_header instead of duplicated members; 2. Use existing macros like GRE_KEY, GRE_SEQ, and so on instead of duplicated macros defined by PPTP; 3. Add new macros like GRE_IS_ACK/SEQ and so on instead of PPTP_GRE_IS_A/S and so on; Signed-off-by: Gao Feng Reviewed-by: Philip Prindeville Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 28 +++++++++++++--------------- include/net/pptp.h | 19 +------------------ include/uapi/linux/if_tunnel.h | 12 ++++++++++-- 3 files changed, 24 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 3e68dbc0af7f..1951b1085cb8 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -206,16 +206,14 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb_push(skb, header_len); hdr = (struct pptp_gre_header *)(skb->data); - hdr->flags = PPTP_GRE_FLAG_K; - hdr->ver = PPTP_GRE_VER; - hdr->protocol = htons(PPTP_GRE_PROTO); - hdr->call_id = htons(opt->dst_addr.call_id); + hdr->gre_hd.flags = GRE_KEY | GRE_VERSION_1 | GRE_SEQ; + hdr->gre_hd.protocol = GRE_PROTO_PPP; + hdr->call_id = htons(opt->dst_addr.call_id); - hdr->flags |= PPTP_GRE_FLAG_S; - hdr->seq = htonl(++opt->seq_sent); + hdr->seq = htonl(++opt->seq_sent); if (opt->ack_sent != seq_recv) { /* send ack with this message */ - hdr->ver |= PPTP_GRE_FLAG_A; + hdr->gre_hd.flags |= GRE_ACK; hdr->ack = htonl(seq_recv); opt->ack_sent = seq_recv; } @@ -278,7 +276,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) headersize = sizeof(*header); /* test if acknowledgement present */ - if (PPTP_GRE_IS_A(header->ver)) { + if (GRE_IS_ACK(header->gre_hd.flags)) { __u32 ack; if (!pskb_may_pull(skb, headersize)) @@ -286,7 +284,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) header = (struct pptp_gre_header *)(skb->data); /* ack in different place if S = 0 */ - ack = PPTP_GRE_IS_S(header->flags) ? header->ack : header->seq; + ack = GRE_IS_SEQ(header->gre_hd.flags) ? header->ack : header->seq; ack = ntohl(ack); @@ -299,7 +297,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) headersize -= sizeof(header->ack); } /* test if payload present */ - if (!PPTP_GRE_IS_S(header->flags)) + if (!GRE_IS_SEQ(header->gre_hd.flags)) goto drop; payload_len = ntohs(header->payload_len); @@ -360,11 +358,11 @@ static int pptp_rcv(struct sk_buff *skb) header = (struct pptp_gre_header *)skb->data; - if (ntohs(header->protocol) != PPTP_GRE_PROTO || /* PPTP-GRE protocol for PPTP */ - PPTP_GRE_IS_C(header->flags) || /* flag C should be clear */ - PPTP_GRE_IS_R(header->flags) || /* flag R should be clear */ - !PPTP_GRE_IS_K(header->flags) || /* flag K should be set */ - (header->flags&0xF) != 0) /* routing and recursion ctrl = 0 */ + if (header->gre_hd.protocol != GRE_PROTO_PPP || /* PPTP-GRE protocol for PPTP */ + GRE_IS_CSUM(header->gre_hd.flags) || /* flag CSUM should be clear */ + GRE_IS_ROUTING(header->gre_hd.flags) || /* flag ROUTING should be clear */ + !GRE_IS_KEY(header->gre_hd.flags) || /* flag KEY should be set */ + (header->gre_hd.flags & GRE_FLAGS)) /* flag Recursion Ctrl should be clear */ /* if invalid, discard this packet */ goto drop; diff --git a/include/net/pptp.h b/include/net/pptp.h index 301d3e2ba1f9..92e9f1fe2628 100644 --- a/include/net/pptp.h +++ b/include/net/pptp.h @@ -10,26 +10,9 @@ ((((curseq) & 0xffffff00) == 0) &&\ (((lastseq) & 0xffffff00) == 0xffffff00)) -#define PPTP_GRE_PROTO 0x880B -#define PPTP_GRE_VER 0x1 - -#define PPTP_GRE_FLAG_C 0x80 -#define PPTP_GRE_FLAG_R 0x40 -#define PPTP_GRE_FLAG_K 0x20 -#define PPTP_GRE_FLAG_S 0x10 -#define PPTP_GRE_FLAG_A 0x80 - -#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C) -#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R) -#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K) -#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S) -#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A) - #define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header)) struct pptp_gre_header { - u8 flags; - u8 ver; - __be16 protocol; + struct gre_base_hdr gre_hd; __be16 payload_len; __be16 call_id; __be32 seq; diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 60dbb200de60..361b9f0f20d7 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -28,8 +28,16 @@ #define GRE_FLAGS __cpu_to_be16(0x0078) #define GRE_VERSION __cpu_to_be16(0x0007) -#define GRE_VERSION_1 __cpu_to_be16(0x0001) -#define GRE_PROTO_PPP __cpu_to_be16(0x880b) +#define GRE_IS_CSUM(f) ((f) & GRE_CSUM) +#define GRE_IS_ROUTING(f) ((f) & GRE_ROUTING) +#define GRE_IS_KEY(f) ((f) & GRE_KEY) +#define GRE_IS_SEQ(f) ((f) & GRE_SEQ) +#define GRE_IS_STRICT(f) ((f) & GRE_STRICT) +#define GRE_IS_REC(f) ((f) & GRE_REC) +#define GRE_IS_ACK(f) ((f) & GRE_ACK) + +#define GRE_VERSION_1 __cpu_to_be16(0x0001) +#define GRE_PROTO_PPP __cpu_to_be16(0x880b) #define GRE_PPTP_KEY_MASK __cpu_to_be32(0xffff) struct ip_tunnel_parm { -- cgit v1.2.3 From 09a7d9eca1a6cf5eb4f9abfdf8914db9dbd96f08 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 01:17:59 +0300 Subject: {net,IB}/mlx5: QP/XRCD commands via mlx5 ifc Remove old representation of manually created QP/XRCD commands layout amd use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 154 +++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 14 +- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 167 ++++++++-------------- include/linux/mlx5/mlx5_ifc.h | 5 +- include/linux/mlx5/qp.h | 108 +------------- 5 files changed, 165 insertions(+), 283 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 21ab0e26fa71..d22492ff863e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -726,7 +726,7 @@ err_umem: static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, - struct mlx5_create_qp_mbox_in **in, + u32 **in, struct mlx5_ib_create_qp_resp *resp, int *inlen, struct mlx5_ib_qp_base *base) { @@ -739,6 +739,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, u32 offset = 0; int uuarn; int ncont = 0; + __be64 *pas; + void *qpc; int err; err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); @@ -795,20 +797,24 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, ubuffer->umem = NULL; } - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_umem; } + + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); if (ubuffer->umem) - mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, - (*in)->pas, 0); - (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); - (*in)->ctx.params2 = cpu_to_be32(offset << 6); + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + + MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, page_offset, offset); - (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + MLX5_SET(qpc, qpc, uar_page, uar_index); resp->uuar_index = uuarn; qp->uuarn = uuarn; @@ -857,12 +863,13 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in **in, int *inlen, + u32 **in, int *inlen, struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; int uar_index; + void *qpc; int uuarn; int err; @@ -902,25 +909,29 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_buf; } - (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, uar_page, uar_index); + MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + /* Set "fast registration enabled" for all kernel QPs */ - (*in)->ctx.params1 |= cpu_to_be32(1 << 11); - (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { - (*in)->ctx.deth_sqpn = cpu_to_be32(1); + MLX5_SET(qpc, qpc, deth_sqpn, 1); qp->flags |= MLX5_IB_QP_SQPN_QP1; } - mlx5_fill_page_array(&qp->buf, (*in)->pas); + mlx5_fill_page_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas)); err = mlx5_db_alloc(dev->mdev, &qp->db); if (err) { @@ -974,15 +985,15 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); } -static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || (attr->qp_type == IB_QPT_XRC_INI)) - return cpu_to_be32(MLX5_SRQ_RQ); + return MLX5_SRQ_RQ; else if (!qp->has_rq) - return cpu_to_be32(MLX5_ZERO_LEN_RQ); + return MLX5_ZERO_LEN_RQ; else - return cpu_to_be32(MLX5_NON_ZERO_RQ); + return MLX5_NON_ZERO_RQ; } static int is_connected(enum ib_qp_type qp_type) @@ -1191,7 +1202,7 @@ static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, } static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in *in, + u32 *in, struct ib_pd *pd) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; @@ -1461,18 +1472,18 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_qp_base *base; struct mlx5_ib_create_qp_resp resp; - struct mlx5_create_qp_mbox_in *in; - struct mlx5_ib_create_qp ucmd; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; - int inlen = sizeof(*in); - int err; u32 uidx = MLX5_IB_DEFAULT_UIDX; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_qp_base *base; void *qpc; + u32 *in; + int err; base = init_attr->qp_type == IB_QPT_RAW_PACKET ? &qp->raw_packet_qp.rq.base : @@ -1600,7 +1611,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (err) return err; } else { - in = mlx5_vzalloc(sizeof(*in)); + in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; @@ -1610,26 +1621,29 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; - in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 | - MLX5_QP_PM_MIGRATED << 11); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, to_mlx5_st(init_attr->qp_type)); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) - in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); else - in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + MLX5_SET(qpc, qpc, latency_sensitive, 1); + if (qp->wq_sig) - in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + MLX5_SET(qpc, qpc, wq_signature, 1); if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) - in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + MLX5_SET(qpc, qpc, block_lb_mc, 1); if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) - in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER); + MLX5_SET(qpc, qpc, cd_master, 1); if (qp->flags & MLX5_IB_QP_MANAGED_SEND) - in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND); + MLX5_SET(qpc, qpc, cd_slave_send, 1); if (qp->flags & MLX5_IB_QP_MANAGED_RECV) - in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV); + MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; @@ -1639,71 +1653,68 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); if (rcqe_sz == 128) - in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); else - in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { if (scqe_sz == 128) - in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); else - in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); } } if (qp->rq.wqe_cnt) { - in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); - in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); } - in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); if (qp->sq.wqe_cnt) - in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); else - in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + MLX5_SET(qpc, qpc, no_sq, 1); /* Set default resources */ switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: - in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); - in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); break; case IB_QPT_XRC_INI: - in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); break; default: if (init_attr->srq) { - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn); } else { - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); - in->ctx.rq_type_srqn |= - cpu_to_be32(to_msrq(devr->s1)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn); } } if (init_attr->send_cq) - in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); if (init_attr->recv_cq) - in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); - in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); - if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) { - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); - /* 0xffffff means we ask to work with cqe version 0 */ + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) MLX5_SET(qpc, qpc, user_index, uidx); - } + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ if (init_attr->qp_type == IB_QPT_UD && (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); qp->flags |= MLX5_IB_QP_LSO; } @@ -4320,21 +4331,24 @@ static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_attr *qp_attr) { - struct mlx5_query_qp_mbox_out *outb; + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); struct mlx5_qp_context *context; int mlx5_state; + u32 *outb; int err = 0; - outb = kzalloc(sizeof(*outb), GFP_KERNEL); + outb = kzalloc(outlen, GFP_KERNEL); if (!outb) return -ENOMEM; - context = &outb->ctx; err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, - sizeof(*outb)); + outlen); if (err) goto out; + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); + mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index b7484e4128c8..e94a9532e218 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -277,24 +277,28 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, int index, int *is_str) { - struct mlx5_query_qp_mbox_out *out; + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); struct mlx5_qp_context *ctx; u64 param = 0; + u32 *out; int err; int no_sq; - out = kzalloc(sizeof(*out), GFP_KERNEL); + out = kzalloc(outlen, GFP_KERNEL); if (!out) return param; - err = mlx5_core_qp_query(dev, qp, out, sizeof(*out)); + err = mlx5_core_qp_query(dev, qp, out, outlen); if (err) { - mlx5_core_warn(dev, "failed to query qp\n"); + mlx5_core_warn(dev, "failed to query qp err=%d\n", err); goto out; } *is_str = 0; - ctx = &out->ctx; + + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + ctx = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, out, qpc); + switch (index) { case QP_PID: param = qp->pid; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index b82d65802d96..36d240c9d15f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -271,30 +271,21 @@ static void destroy_qprqsq_common(struct mlx5_core_dev *dev, int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_create_qp_mbox_in *in, - int inlen) + u32 *in, int inlen) { - struct mlx5_create_qp_mbox_out out; - struct mlx5_destroy_qp_mbox_in din; - struct mlx5_destroy_qp_mbox_out dout; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {0}; + u32 dout[MLX5_ST_SZ_DW(destroy_qp_out)]; + u32 din[MLX5_ST_SZ_DW(destroy_qp_in)]; int err; - memset(&out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP); + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); - if (err) { - mlx5_core_warn(dev, "ret %d\n", err); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); + if (err) return err; - } - - if (out.hdr.status) { - mlx5_core_warn(dev, "current num of QPs 0x%x\n", - atomic_read(&dev->num_qps)); - return mlx5_cmd_status_to_err(&out.hdr); - } - qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; + qp->qpn = MLX5_GET(create_qp_out, out, qpn); mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); err = create_qprqsq_common(dev, qp, MLX5_RES_QP); @@ -311,12 +302,12 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, return 0; err_cmd: - memset(&din, 0, sizeof(din)); - memset(&dout, 0, sizeof(dout)); - din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP); - din.qpn = cpu_to_be32(qp->qpn); - mlx5_cmd_exec(dev, &din, sizeof(din), &out, sizeof(dout)); - + memset(din, 0, sizeof(din)); + memset(dout, 0, sizeof(dout)); + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + mlx5_cmd_status_to_err_v2(dout); return err; } EXPORT_SYMBOL_GPL(mlx5_core_create_qp); @@ -324,25 +315,21 @@ EXPORT_SYMBOL_GPL(mlx5_core_create_qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { - struct mlx5_destroy_qp_mbox_in in; - struct mlx5_destroy_qp_mbox_out out; + u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {0}; int err; mlx5_debug_qp_remove(dev, qp); destroy_qprqsq_common(dev, qp); - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP); - in.qpn = cpu_to_be32(qp->qpn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; - if (out.hdr.status) - return mlx5_cmd_status_to_err(&out.hdr); - atomic_dec(&dev->num_qps); return 0; } @@ -382,66 +369,44 @@ void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev) } int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_query_qp_mbox_out *out, int outlen) + u32 *out, int outlen) { - struct mlx5_query_qp_mbox_in in; + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(out, 0, outlen); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_QP); - in.qpn = cpu_to_be32(qp->qpn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); - if (err) - return err; + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); - if (out->hdr.status) - return mlx5_cmd_status_to_err(&out->hdr); - - return err; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL_GPL(mlx5_core_qp_query); int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) { - struct mlx5_alloc_xrcd_mbox_in in; - struct mlx5_alloc_xrcd_mbox_out out; + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_XRCD); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - else - *xrcdn = be32_to_cpu(out.xrcdn) & 0xffffff; - + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = err ? : mlx5_cmd_status_to_err_v2(out); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); return err; } EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) { - struct mlx5_dealloc_xrcd_mbox_in in; - struct mlx5_dealloc_xrcd_mbox_out out; + u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_XRCD); - in.xrcdn = cpu_to_be32(xrcdn); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - - return err; + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); @@ -449,28 +414,26 @@ EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, u8 flags, int error) { - struct mlx5_page_fault_resume_mbox_in in; - struct mlx5_page_fault_resume_mbox_out out; + u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {0}; int err; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME); - in.hdr.opmod = 0; - flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR | - MLX5_PAGE_FAULT_RESUME_WRITE | - MLX5_PAGE_FAULT_RESUME_RDMA); - flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0); - in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) | - (flags << MLX5_QPN_BITS)); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; + MLX5_SET(page_fault_resume_in, in, opcode, + MLX5_CMD_OP_PAGE_FAULT_RESUME); - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); + MLX5_SET(page_fault_resume_in, in, qpn, qpn); - return err; + if (flags & MLX5_PAGE_FAULT_RESUME_REQUESTOR) + MLX5_SET(page_fault_resume_in, in, req_res, 1); + if (flags & MLX5_PAGE_FAULT_RESUME_WRITE) + MLX5_SET(page_fault_resume_in, in, read_write, 1); + if (flags & MLX5_PAGE_FAULT_RESUME_RDMA) + MLX5_SET(page_fault_resume_in, in, rdma, 1); + if (error) + MLX5_SET(page_fault_resume_in, in, error, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return err ? : mlx5_cmd_status_to_err_v2(out); } EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif @@ -541,13 +504,10 @@ EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id) { - u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)]; - u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)]; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0}; int err; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); if (!err) @@ -559,11 +519,8 @@ EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter); int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id) { - u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)]; - u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)] = {0}; MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); @@ -576,9 +533,7 @@ EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter); int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, int reset, void *out, int out_size) { - u32 in[MLX5_ST_SZ_DW(query_q_counter_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0}; MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); MLX5_SET(query_q_counter_in, in, clear, reset); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2a39a06dbad4..cb94ac5b8420 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1966,7 +1966,10 @@ struct mlx5_ifc_qpc_bits { u8 reserved_at_3e0[0x8]; u8 cqn_snd[0x18]; - u8 reserved_at_400[0x40]; + u8 reserved_at_400[0x8]; + u8 deth_sqpn[0x18]; + + u8 reserved_at_420[0x20]; u8 reserved_at_440[0x8]; u8 last_acked_psn[0x18]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 7879bf411891..16e1efecaf66 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -123,12 +123,13 @@ enum { }; enum { - MLX5_NON_ZERO_RQ = 0 << 24, - MLX5_SRQ_RQ = 1 << 24, - MLX5_CRQ_RQ = 2 << 24, - MLX5_ZERO_LEN_RQ = 3 << 24 + MLX5_NON_ZERO_RQ = 0x0, + MLX5_SRQ_RQ = 0x1, + MLX5_CRQ_RQ = 0x2, + MLX5_ZERO_LEN_RQ = 0x3 }; +/* TODO REM */ enum { /* params1 */ MLX5_QP_BIT_SRE = 1 << 15, @@ -177,12 +178,6 @@ enum { MLX5_FENCE_MODE_SMALL_AND_FENCE = 4 << 5, }; -enum { - MLX5_QP_LAT_SENSITIVE = 1 << 28, - MLX5_QP_BLOCK_MCAST = 1 << 30, - MLX5_QP_ENABLE_SIG = 1 << 31, -}; - enum { MLX5_RCV_DBR = 0, MLX5_SND_DBR = 1, @@ -525,34 +520,6 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -struct mlx5_create_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_qpn; - u8 rsvd0[4]; - __be32 opt_param_mask; - u8 rsvd1[4]; - struct mlx5_qp_context ctx; - u8 rsvd3[16]; - __be64 pas[0]; -}; - -struct mlx5_create_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - struct mlx5_modify_qp_mbox_in { struct mlx5_inbox_hdr hdr; __be32 qpn; @@ -568,56 +535,6 @@ struct mlx5_modify_qp_mbox_out { u8 rsvd0[8]; }; -struct mlx5_query_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd[4]; -}; - -struct mlx5_query_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd1[8]; - __be32 optparam; - u8 rsvd0[4]; - struct mlx5_qp_context ctx; - u8 rsvd2[16]; - __be64 pas[0]; -}; - -struct mlx5_conf_sqp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd[3]; - u8 type; -}; - -struct mlx5_conf_sqp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_alloc_xrcd_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_alloc_xrcd_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 xrcdn; - u8 rsvd[4]; -}; - -struct mlx5_dealloc_xrcd_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 xrcdn; - u8 rsvd[4]; -}; - -struct mlx5_dealloc_xrcd_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) { return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); @@ -628,20 +545,9 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } -struct mlx5_page_fault_resume_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 flags_qpn; - u8 reserved[4]; -}; - -struct mlx5_page_fault_resume_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_create_qp_mbox_in *in, + u32 *in, int inlen); int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, @@ -649,7 +555,7 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_query_qp_mbox_out *out, int outlen); + u32 *out, int outlen); int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); -- cgit v1.2.3 From 1a412fb1caa2c1b77719ccb5ed8b0c3c2bc65da7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 18:03:21 +0300 Subject: {net,IB}/mlx5: Modify QP commands via mlx5 ifc Prior to this patch we assumed that modify QP commands have the same layout. In ConnectX-4 for each QP transition there is a specific command and their layout can vary. e.g: 2err/2rst commands don't have QP context in their layout and before this patch we posted the QP context in those commands. Fortunately the FW only checks the suffix of the commands and executes them, while ignoring all invalid data sent after the valid command layout. This patch removes mlx5_modify_qp_mbox_in and changes mlx5_core_qp_modify to receive the required transition and QP context with opt_param_mask if needed. This way the caller is not required to provide the command inbox layout and it will be generated automatically. mlx5_core_qp_modify will generate the command inbox/outbox layouts according to the requested transition and will fill the requested parameters. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 22 ++--- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 124 +++++++++++++++++++++++++-- include/linux/mlx5/qp.h | 20 +---- 3 files changed, 124 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d22492ff863e..626173736749 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1871,7 +1871,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_ib_qp_base *base = &qp->trans_qp.base; - struct mlx5_modify_qp_mbox_in *in; unsigned long flags; int err; @@ -1884,16 +1883,12 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) - return; - if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { mlx5_ib_qp_disable_pagefaults(qp); err = mlx5_core_qp_modify(dev->mdev, - MLX5_CMD_OP_2RST_QP, in, 0, - &base->mqp); + MLX5_CMD_OP_2RST_QP, 0, + NULL, &base->mqp); } else { err = modify_raw_packet_qp(dev, qp, MLX5_CMD_OP_2RST_QP); @@ -1935,8 +1930,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) base->mqp.qpn); } - kfree(in); - if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) @@ -2522,7 +2515,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; - struct mlx5_modify_qp_mbox_in *in; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; @@ -2531,11 +2523,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int err; u16 op; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return -ENOMEM; - context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); if (err < 0) { mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); @@ -2700,12 +2691,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; - in->optparam = cpu_to_be32(optpar); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) err = modify_raw_packet_qp(dev, qp, op); else - err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, &base->mqp); if (err) goto out; @@ -2746,7 +2736,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } out: - kfree(in); + kfree(context); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 36d240c9d15f..50875a4e9d58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -335,21 +335,127 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, - struct mlx5_modify_qp_mbox_in *in, int sqd_event, +struct mbox_info { + u32 *in; + u32 *out; + int inlen; + int outlen; +}; + +static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) +{ + mbox->inlen = inlen; + mbox->outlen = outlen; + mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); + mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); + if (!mbox->in || !mbox->out) { + kfree(mbox->in); + kfree(mbox->out); + return -ENOMEM; + } + + return 0; +} + +static void mbox_free(struct mbox_info *mbox) +{ + kfree(mbox->in); + kfree(mbox->out); +} + +static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, + u32 opt_param_mask, void *qpc, + struct mbox_info *mbox) +{ + mbox->out = NULL; + mbox->in = NULL; + +#define MBOX_ALLOC(mbox, typ) \ + mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) + +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn) \ + MLX5_SET(typ##_in, in, opcode, _opcode); \ + MLX5_SET(typ##_in, in, qpn, _qpn) + +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc) \ + MOD_QP_IN_SET(typ, in, _opcode, _qpn); \ + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, MLX5_ST_SZ_BYTES(qpc)) + + switch (opcode) { + /* 2RST & 2ERR */ + case MLX5_CMD_OP_2RST_QP: + if (MBOX_ALLOC(mbox, qp_2rst)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn); + break; + case MLX5_CMD_OP_2ERR_QP: + if (MBOX_ALLOC(mbox, qp_2err)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn); + break; + + /* MODIFY with QPC */ + case MLX5_CMD_OP_RST2INIT_QP: + if (MBOX_ALLOC(mbox, rst2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + if (MBOX_ALLOC(mbox, init2rtr_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + if (MBOX_ALLOC(mbox, rtr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + if (MBOX_ALLOC(mbox, rts2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + if (MBOX_ALLOC(mbox, sqerr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + if (MBOX_ALLOC(mbox, init2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + default: + mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n", + opcode, qpn); + return -EINVAL; + } + return 0; +} + +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, + u32 opt_param_mask, void *qpc, struct mlx5_core_qp *qp) { - struct mlx5_modify_qp_mbox_out out; - int err = 0; + struct mbox_info mbox; + int err; - memset(&out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(operation); - in->qpn = cpu_to_be32(qp->qpn); - err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); + err = modify_qp_mbox_alloc(dev, opcode, qp->qpn, + opt_param_mask, qpc, &mbox); if (err) return err; - return mlx5_cmd_status_to_err(&out.hdr); + err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); + err = err ? : mlx5_cmd_status_to_err_v2(mbox.out); + mbox_free(&mbox); + return err; } EXPORT_SYMBOL_GPL(mlx5_core_qp_modify); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 16e1efecaf66..0aacb2a7480d 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -479,6 +479,7 @@ struct mlx5_qp_path { u8 rmac[6]; }; +/* FIXME: use mlx5_ifc.h qpc */ struct mlx5_qp_context { __be32 flags; __be32 flags_pd; @@ -520,21 +521,6 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -struct mlx5_modify_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; - __be32 optparam; - u8 rsvd1[4]; - struct mlx5_qp_context ctx; - u8 rsvd2[16]; -}; - -struct mlx5_modify_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) { return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); @@ -549,8 +535,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, - struct mlx5_modify_qp_mbox_in *in, int sqd_event, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, + u32 opt_param_mask, void *qpc, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -- cgit v1.2.3 From c4f287c4a6ac489c18afc4acc4353141a8c53070 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 20:17:12 +0300 Subject: net/mlx5: Unify and improve command interface Now as all commands use mlx5 ifc interface, instead of doing two calls for executing a command we embed command status checking into mlx5_cmd_exec to simplify the interface. Also we do here some cleanup for redundant software structures (inbox/outbox) and functions and improved command failure output. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 10 +- drivers/infiniband/hw/mlx5/qp.c | 5 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 251 +++++++++++---------- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 16 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 72 ++---- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 130 +++-------- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 15 +- drivers/net/ethernet/mellanox/mlx5/core/mad.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/main.c | 80 ++----- drivers/net/ethernet/mellanox/mlx5/core/mcg.c | 10 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 13 -- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 28 +-- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 11 +- drivers/net/ethernet/mellanox/mlx5/core/pd.c | 11 +- drivers/net/ethernet/mellanox/mlx5/core/port.c | 99 ++------ drivers/net/ethernet/mellanox/mlx5/core/qp.c | 26 +-- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 11 +- drivers/net/ethernet/mellanox/mlx5/core/srq.c | 49 ++-- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 183 +++++---------- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 11 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 74 ++---- drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 29 +-- include/linux/mlx5/device.h | 115 ---------- include/linux/mlx5/driver.h | 7 +- 27 files changed, 385 insertions(+), 897 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a84bb766fc62..6fb77d791045 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -233,23 +233,19 @@ static int set_roce_addr(struct ib_device *device, u8 port_num, const union ib_gid *gid, const struct ib_gid_attr *attr) { - struct mlx5_ib_dev *dev = to_mdev(device); - u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; - u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + struct mlx5_ib_dev *dev = to_mdev(device); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); if (ll != IB_LINK_LAYER_ETHERNET) return -EINVAL; - memset(in, 0, sizeof(in)); - ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); - - memset(out, 0, sizeof(out)); return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 626173736749..f3c943f6458e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1007,13 +1007,10 @@ static int is_connected(enum ib_qp_type qp_type) static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, u32 tdn) { - u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); - memset(in, 0, sizeof(in)); - MLX5_SET(tisc, tisc, transport_domain, tdn); - return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 88b05400fbc8..23b95daa9533 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -554,11 +554,124 @@ const char *mlx5_command_str(int command) } } +static const char *cmd_status_str(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: + return "OK"; + case MLX5_CMD_STAT_INT_ERR: + return "internal error"; + case MLX5_CMD_STAT_BAD_OP_ERR: + return "bad operation"; + case MLX5_CMD_STAT_BAD_PARAM_ERR: + return "bad parameter"; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: + return "bad system state"; + case MLX5_CMD_STAT_BAD_RES_ERR: + return "bad resource"; + case MLX5_CMD_STAT_RES_BUSY: + return "resource busy"; + case MLX5_CMD_STAT_LIM_ERR: + return "limits exceeded"; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: + return "bad resource state"; + case MLX5_CMD_STAT_IX_ERR: + return "bad index"; + case MLX5_CMD_STAT_NO_RES_ERR: + return "no resources"; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: + return "bad input length"; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: + return "bad output length"; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: + return "bad QP state"; + case MLX5_CMD_STAT_BAD_PKT_ERR: + return "bad packet (discarded)"; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: + return "bad size too many outstanding CQEs"; + default: + return "unknown status"; + } +} + +static int cmd_status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: return 0; + case MLX5_CMD_STAT_INT_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PARAM_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_RES_ERR: return -EINVAL; + case MLX5_CMD_STAT_RES_BUSY: return -EBUSY; + case MLX5_CMD_STAT_LIM_ERR: return -ENOMEM; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_IX_ERR: return -EINVAL; + case MLX5_CMD_STAT_NO_RES_ERR: return -EAGAIN; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PKT_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: return -EINVAL; + default: return -EIO; + } +} + +struct mlx5_ifc_mbox_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_mbox_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome) +{ + *status = MLX5_GET(mbox_out, out, status); + *syndrome = MLX5_GET(mbox_out, out, syndrome); +} + +static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out) +{ + u32 syndrome; + u8 status; + u16 opcode; + u16 op_mod; + + mlx5_cmd_mbox_status(out, &status, &syndrome); + if (!status) + return 0; + + opcode = MLX5_GET(mbox_in, in, opcode); + op_mod = MLX5_GET(mbox_in, in, op_mod); + + mlx5_core_err(dev, + "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n", + mlx5_command_str(opcode), + opcode, op_mod, + cmd_status_str(status), + status, + syndrome); + + return cmd_status_to_err(status); +} + static void dump_command(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent, int input) { - u16 op = be16_to_cpu(((struct mlx5_inbox_hdr *)(ent->lay->in))->opcode); struct mlx5_cmd_msg *msg = input ? ent->in : ent->out; + u16 op = MLX5_GET(mbox_in, ent->lay->in, opcode); struct mlx5_cmd_mailbox *next = msg->next; int data_only; u32 offset = 0; @@ -608,9 +721,7 @@ static void dump_command(struct mlx5_core_dev *dev, static u16 msg_to_opcode(struct mlx5_cmd_msg *in) { - struct mlx5_inbox_hdr *hdr = (struct mlx5_inbox_hdr *)(in->first.data); - - return be16_to_cpu(hdr->opcode); + return MLX5_GET(mbox_in, in->first.data, opcode); } static void cb_timeout_handler(struct work_struct *work) @@ -749,16 +860,6 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) return err; } -static __be32 *get_synd_ptr(struct mlx5_outbox_hdr *out) -{ - return &out->syndrome; -} - -static u8 *get_status_ptr(struct mlx5_outbox_hdr *out) -{ - return &out->status; -} - /* Notes: * 1. Callback functions may not sleep * 2. page queue commands do not support asynchrous completion @@ -804,7 +905,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, goto out_free; ds = ent->ts2 - ent->ts1; - op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode); + op = MLX5_GET(mbox_in, in->first.data, opcode); if (op < ARRAY_SIZE(cmd->stats)) { stats = &cmd->stats[op]; spin_lock_irq(&stats->lock); @@ -1305,7 +1406,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) err = mlx5_copy_from_msg(ent->uout, ent->out, ent->uout_size); - err = err ? err : mlx5_cmd_status_to_err_v2(ent->uout); + + err = err ? err : mlx5_cmd_check(dev, + ent->in->first.data, + ent->uout); } mlx5_free_cmd_msg(dev, ent->out); @@ -1359,14 +1463,9 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, return msg; } -static u16 opcode_from_in(struct mlx5_inbox_hdr *in) -{ - return be16_to_cpu(in->opcode); -} - -static int is_manage_pages(struct mlx5_inbox_hdr *in) +static int is_manage_pages(void *in) { - return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES; + return MLX5_GET(mbox_in, in, opcode) == MLX5_CMD_OP_MANAGE_PAGES; } static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, @@ -1382,9 +1481,11 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, if (pci_channel_offline(dev->pdev) || dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - err = mlx5_internal_err_ret_value(dev, opcode_from_in(in), &drv_synd, &status); - *get_synd_ptr(out) = cpu_to_be32(drv_synd); - *get_status_ptr(out) = status; + u16 opcode = MLX5_GET(mbox_in, in, opcode); + + err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); + MLX5_SET(mbox_out, out, status, status); + MLX5_SET(mbox_out, out, syndrome, drv_synd); return err; } @@ -1436,7 +1537,10 @@ out_in: int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { - return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL); + int err; + + err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL); + return err ? : mlx5_cmd_check(dev, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec); @@ -1673,96 +1777,3 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) pci_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); - -static const char *cmd_status_str(u8 status) -{ - switch (status) { - case MLX5_CMD_STAT_OK: - return "OK"; - case MLX5_CMD_STAT_INT_ERR: - return "internal error"; - case MLX5_CMD_STAT_BAD_OP_ERR: - return "bad operation"; - case MLX5_CMD_STAT_BAD_PARAM_ERR: - return "bad parameter"; - case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: - return "bad system state"; - case MLX5_CMD_STAT_BAD_RES_ERR: - return "bad resource"; - case MLX5_CMD_STAT_RES_BUSY: - return "resource busy"; - case MLX5_CMD_STAT_LIM_ERR: - return "limits exceeded"; - case MLX5_CMD_STAT_BAD_RES_STATE_ERR: - return "bad resource state"; - case MLX5_CMD_STAT_IX_ERR: - return "bad index"; - case MLX5_CMD_STAT_NO_RES_ERR: - return "no resources"; - case MLX5_CMD_STAT_BAD_INP_LEN_ERR: - return "bad input length"; - case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: - return "bad output length"; - case MLX5_CMD_STAT_BAD_QP_STATE_ERR: - return "bad QP state"; - case MLX5_CMD_STAT_BAD_PKT_ERR: - return "bad packet (discarded)"; - case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: - return "bad size too many outstanding CQEs"; - default: - return "unknown status"; - } -} - -static int cmd_status_to_err(u8 status) -{ - switch (status) { - case MLX5_CMD_STAT_OK: return 0; - case MLX5_CMD_STAT_INT_ERR: return -EIO; - case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; - case MLX5_CMD_STAT_BAD_PARAM_ERR: return -EINVAL; - case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: return -EIO; - case MLX5_CMD_STAT_BAD_RES_ERR: return -EINVAL; - case MLX5_CMD_STAT_RES_BUSY: return -EBUSY; - case MLX5_CMD_STAT_LIM_ERR: return -ENOMEM; - case MLX5_CMD_STAT_BAD_RES_STATE_ERR: return -EINVAL; - case MLX5_CMD_STAT_IX_ERR: return -EINVAL; - case MLX5_CMD_STAT_NO_RES_ERR: return -EAGAIN; - case MLX5_CMD_STAT_BAD_INP_LEN_ERR: return -EIO; - case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: return -EIO; - case MLX5_CMD_STAT_BAD_QP_STATE_ERR: return -EINVAL; - case MLX5_CMD_STAT_BAD_PKT_ERR: return -EINVAL; - case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: return -EINVAL; - default: return -EIO; - } -} - -/* this will be available till all the commands use set/get macros */ -int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) -{ - if (!hdr->status) - return 0; - - pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", - cmd_status_str(hdr->status), hdr->status, - be32_to_cpu(hdr->syndrome)); - - return cmd_status_to_err(hdr->status); -} - -int mlx5_cmd_status_to_err_v2(void *ptr) -{ - u32 syndrome; - u8 status; - - status = be32_to_cpu(*(__be32 *)ptr) >> 24; - if (!status) - return 0; - - syndrome = be32_to_cpu(*(__be32 *)(ptr + 4)); - - pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", - cmd_status_str(status), status, syndrome); - - return cmd_status_to_err(status); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index cf02d8a27874..32d4af9b594d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -153,7 +153,6 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, memset(out, 0, sizeof(out)); MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; @@ -187,9 +186,8 @@ err_cmd: memset(dout, 0, sizeof(dout)); MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); - err = mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); - return err ? : mlx5_cmd_status_to_err_v2(out); - + mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + return err; } EXPORT_SYMBOL(mlx5_core_create_cq); @@ -216,7 +214,6 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; @@ -235,13 +232,10 @@ int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *out, int outlen) { u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0}; - int err; MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); MLX5_SET(query_cq_in, in, cqn, cq->cqn); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL(mlx5_core_query_cq); @@ -249,11 +243,9 @@ int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen) { u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0}; - int err; MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_modify_cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 4a3757e60441..9561fca494bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -726,7 +726,7 @@ static int mlx5e_get_link_ksettings(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; - u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; u32 eth_proto_cap; u32 eth_proto_admin; u32 eth_proto_lp; @@ -736,7 +736,6 @@ static int mlx5e_get_link_ksettings(struct net_device *netdev, int err; err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); - if (err) { netdev_err(netdev, "%s: query port ptys failed: %d\n", __func__, err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 98fb0d8ef2d2..10fa12aa063f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -180,18 +180,15 @@ static void mlx5e_update_vport_counters(struct mlx5e_priv *priv) { int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u32 *out = (u32 *)priv->stats.vport.query_vport_out; - u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; struct mlx5_core_dev *mdev = priv->mdev; - memset(in, 0, sizeof(in)); - MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); MLX5_SET(query_vport_counter_in, in, op_mod, 0); MLX5_SET(query_vport_counter_in, in, other_vport, 0); memset(out, 0, outlen); - mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); } @@ -2022,14 +2019,11 @@ static void mlx5e_close_drop_rq(struct mlx5e_priv *priv) static int mlx5e_create_tis(struct mlx5e_priv *priv, int tc) { struct mlx5_core_dev *mdev = priv->mdev; - u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); - memset(in, 0, sizeof(in)); - MLX5_SET(tisc, tisc, prio, tc << 1); MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn); - return mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 71411976ef1c..aaca09002ca6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -88,14 +88,10 @@ static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) { u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0}; u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {0}; - int err; MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); MLX5_SET(destroy_eq_in, in, eq_number, eqn); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); - + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) @@ -383,7 +379,6 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) goto err_in; @@ -547,12 +542,9 @@ int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u32 *out, int outlen) { u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {0}; - int err; MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); MLX5_SET(query_eq_in, in, eq_number, eq->eqn); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL_GPL(mlx5_core_eq_query); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index f6d667797ee1..0a364bf2cd71 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -87,13 +87,9 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports); static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, u32 events_mask) { - int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)]; - int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; + int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; void *nic_vport_ctx; - int err; - - memset(out, 0, sizeof(out)); - memset(in, 0, sizeof(in)); MLX5_SET(modify_nic_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); @@ -116,45 +112,31 @@ static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, MLX5_SET(nic_vport_context, nic_vport_ctx, event_on_promisc_change, 1); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (err) - goto ex; - err = mlx5_cmd_status_to_err_v2(out); - if (err) - goto ex; - return 0; -ex: - return err; + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } /* E-Switch vport context HW commands */ static int query_esw_vport_context_cmd(struct mlx5_core_dev *mdev, u32 vport, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {0}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); - MLX5_SET(query_esw_vport_context_in, in, vport_number, vport); if (vport) MLX5_SET(query_esw_vport_context_in, in, other_vport, 1); - - return mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); } static int query_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, u16 *vlan, u8 *qos) { - u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)]; + u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {0}; int err; bool cvlan_strip; bool cvlan_insert; - memset(out, 0, sizeof(out)); - *vlan = 0; *qos = 0; @@ -188,27 +170,20 @@ out: static int modify_esw_vport_context_cmd(struct mlx5_core_dev *dev, u16 vport, void *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_esw_vport_context_out)]; - - memset(out, 0, sizeof(out)); + u32 out[MLX5_ST_SZ_DW(modify_esw_vport_context_out)] = {0}; + MLX5_SET(modify_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT); MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport); if (vport) MLX5_SET(modify_esw_vport_context_in, in, other_vport, 1); - - MLX5_SET(modify_esw_vport_context_in, in, opcode, - MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT); - - return mlx5_cmd_exec_check_status(dev, in, inlen, - out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, u16 vlan, u8 qos, bool set) { - u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {0}; if (!MLX5_CAP_ESW(dev, vport_cvlan_strip) || !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist)) @@ -216,7 +191,6 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%d\n", vport, vlan, qos, set); - if (set) { MLX5_SET(modify_esw_vport_context_in, in, esw_vport_context.vport_cvlan_strip, 1); @@ -241,13 +215,10 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, static int set_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac, u8 vlan_valid, u16 vlan) { - u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)]; - u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)]; + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0}; u8 *in_mac_addr; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); MLX5_SET(set_l2_table_entry_in, in, table_index, index); @@ -257,23 +228,18 @@ static int set_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index, in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); ether_addr_copy(&in_mac_addr[2], mac); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), - out, sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } static int del_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index) { - u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)]; - u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0}; MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); MLX5_SET(delete_l2_table_entry_in, in, table_index, index); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), - out, sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } static int alloc_l2_table_index(struct mlx5_l2_table *l2_table, u32 *ix) @@ -1903,7 +1869,7 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, struct ifla_vf_stats *vf_stats) { int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); - u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; int err = 0; u32 *out; @@ -1916,8 +1882,6 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, if (!out) return -ENOMEM; - memset(in, 0, sizeof(in)); - MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); MLX5_SET(query_vport_counter_in, in, op_mod, 0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 9134010e2921..e64499ebf2b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -41,10 +41,8 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft) { - u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)]; - u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0}; MLX5_SET(set_flow_table_root_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); @@ -55,9 +53,7 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, MLX5_SET(set_flow_table_root_in, in, other_vport, 1); } - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, @@ -66,12 +62,10 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, unsigned int log_size, struct mlx5_flow_table *next_ft, unsigned int *table_id) { - u32 out[MLX5_ST_SZ_DW(create_flow_table_out)]; - u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]; + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {0}; int err; - memset(in, 0, sizeof(in)); - MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); @@ -87,10 +81,7 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, MLX5_SET(create_flow_table_in, in, other_vport, 1); } - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); - + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (!err) *table_id = MLX5_GET(create_flow_table_out, out, table_id); @@ -100,11 +91,8 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft) { - u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)] = {0}; MLX5_SET(destroy_flow_table_in, in, opcode, MLX5_CMD_OP_DESTROY_FLOW_TABLE); @@ -115,19 +103,15 @@ int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev, MLX5_SET(destroy_flow_table_in, in, other_vport, 1); } - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, struct mlx5_flow_table *next_ft) { - u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)]; - u32 out[MLX5_ST_SZ_DW(modify_flow_table_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_flow_table_out)] = {0}; MLX5_SET(modify_flow_table_in, in, opcode, MLX5_CMD_OP_MODIFY_FLOW_TABLE); @@ -146,8 +130,7 @@ int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev, MLX5_SET(modify_flow_table_in, in, table_miss_mode, 0); } - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, @@ -155,12 +138,10 @@ int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, u32 *in, unsigned int *group_id) { + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {0}; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - u32 out[MLX5_ST_SZ_DW(create_flow_group_out)]; int err; - memset(out, 0, sizeof(out)); - MLX5_SET(create_flow_group_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_GROUP); MLX5_SET(create_flow_group_in, in, table_type, ft->type); @@ -170,13 +151,10 @@ int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, MLX5_SET(create_flow_group_in, in, other_vport, 1); } - err = mlx5_cmd_exec_check_status(dev, in, - inlen, out, - sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *group_id = MLX5_GET(create_flow_group_out, out, group_id); - return err; } @@ -184,11 +162,8 @@ int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, unsigned int group_id) { - u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)]; - u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {0}; MLX5_SET(destroy_flow_group_in, in, opcode, MLX5_CMD_OP_DESTROY_FLOW_GROUP); @@ -200,8 +175,7 @@ int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev, MLX5_SET(destroy_flow_group_in, in, other_vport, 1); } - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, @@ -212,7 +186,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, { unsigned int inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fte->dests_size * MLX5_ST_SZ_BYTES(dest_format_struct); - u32 out[MLX5_ST_SZ_DW(set_fte_out)]; + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {0}; struct mlx5_flow_rule *dst; void *in_flow_context; void *in_match_value; @@ -290,11 +264,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, list_size); } - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, - sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); kvfree(in); - return err; } @@ -303,7 +274,7 @@ int mlx5_cmd_create_fte(struct mlx5_core_dev *dev, unsigned group_id, struct fs_fte *fte) { - return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte); + return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte); } int mlx5_cmd_update_fte(struct mlx5_core_dev *dev, @@ -327,12 +298,8 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, unsigned int index) { - u32 out[MLX5_ST_SZ_DW(delete_fte_out)]; - u32 in[MLX5_ST_SZ_DW(delete_fte_in)]; - int err; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 out[MLX5_ST_SZ_DW(delete_fte_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {0}; MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); MLX5_SET(delete_fte_in, in, table_type, ft->type); @@ -343,74 +310,55 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, MLX5_SET(delete_fte_in, in, other_vport, 1); } - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); - - return err; + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id) { - u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)]; - u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)]; + u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0}; int err; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(alloc_flow_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_FLOW_COUNTER); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); - if (err) - return err; - - *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); - - return 0; + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); + return err; } int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id) { - u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)]; - u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {0}; MLX5_SET(dealloc_flow_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id, u64 *packets, u64 *bytes) { u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) + - MLX5_ST_SZ_BYTES(traffic_counter)]; - u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)]; + MLX5_ST_SZ_BYTES(traffic_counter)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; void *stats; int err = 0; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(query_flow_counter_in, in, opcode, MLX5_CMD_OP_QUERY_FLOW_COUNTER); MLX5_SET(query_flow_counter_in, in, op_mod, 0); MLX5_SET(query_flow_counter_in, in, flow_counter_id, id); - - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (err) return err; stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics); *packets = MLX5_GET64(traffic_counter, stats, packets); *bytes = MLX5_GET64(traffic_counter, stats, octets); - return 0; } @@ -448,18 +396,14 @@ void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b) int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b) { - u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; MLX5_SET(query_flow_counter_in, in, opcode, MLX5_CMD_OP_QUERY_FLOW_COUNTER); MLX5_SET(query_flow_counter_in, in, op_mod, 0); MLX5_SET(query_flow_counter_in, in, flow_counter_id, b->id); MLX5_SET(query_flow_counter_in, in, num_of_counters, b->num); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), - b->out, b->outlen); + return mlx5_cmd_exec(dev, in, sizeof(in), b->out, b->outlen); } void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 56bf520d1429..5718aada6605 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -38,13 +38,10 @@ static int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(query_adapter_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {0}; MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } int mlx5_query_board_id(struct mlx5_core_dev *dev) @@ -164,20 +161,16 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) { u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; - int err; MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) { u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; - int err; MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c index 13e6afd52a9b..3a3b0005fd2b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mad.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c @@ -60,7 +60,6 @@ int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); err = mlx5_cmd_exec(dev, in, inlen, out, outlen); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 4f491d43e77d..7fd662236061 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -363,10 +363,6 @@ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, op_mod, opmod); err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); - if (err) - goto query_ex; - - err = mlx5_cmd_status_to_err_v2(out); if (err) { mlx5_core_warn(dev, "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n", @@ -409,20 +405,11 @@ int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) { - u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; - int err; - - memset(out, 0, sizeof(out)); + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {0}; MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); - err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); - if (err) - return err; - - err = mlx5_cmd_status_to_err_v2(out); - - return err; + return mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); } static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) @@ -528,37 +515,22 @@ static int set_hca_ctrl(struct mlx5_core_dev *dev) int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id) { - u32 out[MLX5_ST_SZ_DW(enable_hca_out)]; - u32 in[MLX5_ST_SZ_DW(enable_hca_in)]; - int err; + u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); MLX5_SET(enable_hca_in, in, function_id, func_id); - memset(out, 0, sizeof(out)); - - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - return mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); } int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id) { - u32 out[MLX5_ST_SZ_DW(disable_hca_out)]; - u32 in[MLX5_ST_SZ_DW(disable_hca_in)]; - int err; + u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); MLX5_SET(disable_hca_in, in, function_id, func_id); - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (err) - return err; - - return mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev) @@ -758,44 +730,40 @@ clean: static int mlx5_core_set_issi(struct mlx5_core_dev *dev) { - u32 query_in[MLX5_ST_SZ_DW(query_issi_in)]; - u32 query_out[MLX5_ST_SZ_DW(query_issi_out)]; - u32 set_in[MLX5_ST_SZ_DW(set_issi_in)]; - u32 set_out[MLX5_ST_SZ_DW(set_issi_out)]; - int err; + u32 query_in[MLX5_ST_SZ_DW(query_issi_in)] = {0}; + u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {0}; u32 sup_issi; - - memset(query_in, 0, sizeof(query_in)); - memset(query_out, 0, sizeof(query_out)); + int err; MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); - - err = mlx5_cmd_exec_check_status(dev, query_in, sizeof(query_in), - query_out, sizeof(query_out)); + err = mlx5_cmd_exec(dev, query_in, sizeof(query_in), + query_out, sizeof(query_out)); if (err) { - if (((struct mlx5_outbox_hdr *)query_out)->status == - MLX5_CMD_STAT_BAD_OP_ERR) { + u32 syndrome; + u8 status; + + mlx5_cmd_mbox_status(query_out, &status, &syndrome); + if (status == MLX5_CMD_STAT_BAD_OP_ERR) { pr_debug("Only ISSI 0 is supported\n"); return 0; } - pr_err("failed to query ISSI\n"); + pr_err("failed to query ISSI err(%d)\n", err); return err; } sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0); if (sup_issi & (1 << 1)) { - memset(set_in, 0, sizeof(set_in)); - memset(set_out, 0, sizeof(set_out)); + u32 set_in[MLX5_ST_SZ_DW(set_issi_in)] = {0}; + u32 set_out[MLX5_ST_SZ_DW(set_issi_out)] = {0}; MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI); MLX5_SET(set_issi_in, set_in, current_issi, 1); - - err = mlx5_cmd_exec_check_status(dev, set_in, sizeof(set_in), - set_out, sizeof(set_out)); + err = mlx5_cmd_exec(dev, set_in, sizeof(set_in), + set_out, sizeof(set_out)); if (err) { - pr_err("failed to set ISSI=1\n"); + pr_err("failed to set ISSI=1 err(%d)\n", err); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c index 01a1abd88203..ba2b09cc192f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -42,15 +42,12 @@ int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {0}; u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {0}; void *gid; - int err; MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); MLX5_SET(attach_to_mcg_in, in, qpn, qpn); gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_attach_mcg); @@ -59,14 +56,11 @@ int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {0}; u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {0}; void *gid; - int err; MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); MLX5_SET(detach_from_mcg_in, in, qpn, qpn); gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_detach_mcg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 2f86ec6fcf25..14242f17a309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -75,19 +75,6 @@ enum { MLX5_CMD_TIME, /* print command execution time */ }; -static inline int mlx5_cmd_exec_check_status(struct mlx5_core_dev *dev, u32 *in, - int in_size, u32 *out, - int out_size) -{ - int err; - - err = mlx5_cmd_exec(dev, in, in_size, out, out_size); - if (err) - return err; - - return mlx5_cmd_status_to_err((struct mlx5_outbox_hdr *)out); -} - int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); int mlx5_cmd_init_hca(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 0a7b743caafe..b9736f505bdf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -75,7 +75,6 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, callback, context); err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout)); - err = err ? : mlx5_cmd_status_to_err_v2(lout); if (err) return err; @@ -119,7 +118,6 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; struct mlx5_core_mkey *deleted_mkey; unsigned long flags; - int err; write_lock_irqsave(&table->lock, flags); deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); @@ -132,9 +130,7 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_mkey); @@ -142,14 +138,11 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *out, int outlen) { u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {0}; - int err; memset(out, 0, outlen); MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY); MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL(mlx5_core_query_mkey); @@ -163,11 +156,9 @@ int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_ MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); - if (err) - return err; - - *mkey = MLX5_GET(query_special_contexts_out, out, dump_fill_mkey); + if (!err) + *mkey = MLX5_GET(query_special_contexts_out, out, + dump_fill_mkey); return err; } EXPORT_SYMBOL(mlx5_core_dump_fill_mkey); @@ -197,11 +188,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, MLX5_SET(create_psv_in, in, num_psv, npsvs); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); - if (err) { - mlx5_core_err(dev, "create_psv cmd exec failed %d\n", err); + if (err) return err; - } for (i = 0; i < npsvs; i++) sig_index[i] = mlx5_get_psv(out, i); @@ -214,11 +202,9 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) { u32 out[MLX5_ST_SZ_DW(destroy_psv_out)] = {0}; u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {0}; - int err; MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV); MLX5_SET(destroy_psv_in, in, psvn, psv_num); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_psv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 7bfac21fbfee..673a7c96479a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -144,7 +144,6 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; @@ -252,8 +251,8 @@ static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id) MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_CANT_GIVE); MLX5_SET(manage_pages_in, in, function_id, func_id); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) mlx5_core_warn(dev, "page notify failed func_id(%d) err(%d)\n", func_id, err); @@ -297,7 +296,6 @@ retry: MLX5_SET(manage_pages_in, in, input_num_entries, npages); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) { mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err); @@ -331,11 +329,8 @@ static int reclaim_pages_cmd(struct mlx5_core_dev *dev, u32 npages; u32 i = 0; - if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { - int err = mlx5_cmd_exec(dev, in, in_size, out, out_size); - - return err ? : mlx5_cmd_status_to_err_v2(out); - } + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + return mlx5_cmd_exec(dev, in, in_size, out, out_size); /* No hard feelings, we want our pages back! */ npages = MLX5_GET(manage_pages_in, in, input_num_entries); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c index efe452c30a8d..bd830d8d6c5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -44,11 +44,8 @@ int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); - if (err) - return err; - - *pdn = MLX5_GET(alloc_pd_out, out, pd); + if (!err) + *pdn = MLX5_GET(alloc_pd_out, out, pd); return err; } EXPORT_SYMBOL(mlx5_core_alloc_pd); @@ -57,11 +54,9 @@ int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn) { u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {0}; u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {0}; - int err; MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); MLX5_SET(dealloc_pd_in, in, pd, pdn); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_dealloc_pd); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index e8324c2a8fc3..2ee28ad8f918 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -61,7 +61,6 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, MLX5_SET(access_register_in, in, register_id, reg_id); err = mlx5_cmd_exec(dev, in, inlen, out, outlen); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) goto out; @@ -102,12 +101,10 @@ EXPORT_SYMBOL_GPL(mlx5_set_port_caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, int ptys_size, int proto_mask, u8 local_port) { - u32 in[MLX5_ST_SZ_DW(ptys_reg)]; + u32 in[MLX5_ST_SZ_DW(ptys_reg)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(ptys_reg, in, local_port, local_port); MLX5_SET(ptys_reg, in, proto_mask, proto_mask); - return mlx5_core_access_reg(dev, in, sizeof(in), ptys, ptys_size, MLX5_REG_PTYS, 0, 0); } @@ -115,13 +112,11 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_ptys); int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration) { + u32 in[MLX5_ST_SZ_DW(mlcr_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(mlcr_reg)]; - u32 in[MLX5_ST_SZ_DW(mlcr_reg)]; - memset(in, 0, sizeof(in)); MLX5_SET(mlcr_reg, in, local_port, 1); MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration); - return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MLCR, 0, 1); } @@ -244,15 +239,12 @@ EXPORT_SYMBOL_GPL(mlx5_toggle_port_link); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status) { - u32 in[MLX5_ST_SZ_DW(paos_reg)]; + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(paos_reg)]; - memset(in, 0, sizeof(in)); - MLX5_SET(paos_reg, in, local_port, 1); MLX5_SET(paos_reg, in, admin_status, status); MLX5_SET(paos_reg, in, ase, 1); - return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PAOS, 0, 1); } @@ -261,19 +253,15 @@ EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status); int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status *status) { - u32 in[MLX5_ST_SZ_DW(paos_reg)]; + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(paos_reg)]; int err; - memset(in, 0, sizeof(in)); - MLX5_SET(paos_reg, in, local_port, 1); - err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PAOS, 0, 0); if (err) return err; - *status = MLX5_GET(paos_reg, out, admin_status); return 0; } @@ -282,13 +270,10 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status); static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu, u16 *max_mtu, u16 *oper_mtu, u8 port) { - u32 in[MLX5_ST_SZ_DW(pmtu_reg)]; + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; - memset(in, 0, sizeof(in)); - MLX5_SET(pmtu_reg, in, local_port, port); - mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PMTU, 0, 0); @@ -302,14 +287,11 @@ static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu, int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port) { - u32 in[MLX5_ST_SZ_DW(pmtu_reg)]; + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; - memset(in, 0, sizeof(in)); - MLX5_SET(pmtu_reg, in, admin_mtu, mtu); MLX5_SET(pmtu_reg, in, local_port, port); - return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PMTU, 0, 1); } @@ -331,15 +313,12 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu); static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) { + u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pmlp_reg)]; - u32 in[MLX5_ST_SZ_DW(pmlp_reg)]; int module_mapping; int err; - memset(in, 0, sizeof(in)); - MLX5_SET(pmlp_reg, in, local_port, 1); - err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PMLP, 0, 0); if (err) @@ -408,11 +387,9 @@ EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, int pvlc_size, u8 local_port) { - u32 in[MLX5_ST_SZ_DW(pvlc_reg)]; + u32 in[MLX5_ST_SZ_DW(pvlc_reg)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(pvlc_reg, in, local_port, local_port); - return mlx5_core_access_reg(dev, in, sizeof(in), pvlc, pvlc_size, MLX5_REG_PVLC, 0, 0); } @@ -458,10 +435,9 @@ EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt); int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) { - u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; - memset(in, 0, sizeof(in)); MLX5_SET(pfcc_reg, in, local_port, 1); MLX5_SET(pfcc_reg, in, pptx, tx_pause); MLX5_SET(pfcc_reg, in, pprx, rx_pause); @@ -474,13 +450,11 @@ EXPORT_SYMBOL_GPL(mlx5_set_port_pause); int mlx5_query_port_pause(struct mlx5_core_dev *dev, u32 *rx_pause, u32 *tx_pause) { - u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; int err; - memset(in, 0, sizeof(in)); MLX5_SET(pfcc_reg, in, local_port, 1); - err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PFCC, 0, 0); if (err) @@ -498,10 +472,9 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_pause); int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx) { - u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; - memset(in, 0, sizeof(in)); MLX5_SET(pfcc_reg, in, local_port, 1); MLX5_SET(pfcc_reg, in, pfctx, pfc_en_tx); MLX5_SET(pfcc_reg, in, pfcrx, pfc_en_rx); @@ -515,13 +488,11 @@ EXPORT_SYMBOL_GPL(mlx5_set_port_pfc); int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx) { - u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; int err; - memset(in, 0, sizeof(in)); MLX5_SET(pfcc_reg, in, local_port, 1); - err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PFCC, 0, 0); if (err) @@ -565,12 +536,11 @@ int mlx5_max_tc(struct mlx5_core_dev *mdev) int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc) { - u32 in[MLX5_ST_SZ_DW(qtct_reg)]; + u32 in[MLX5_ST_SZ_DW(qtct_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(qtct_reg)]; int err; int i; - memset(in, 0, sizeof(in)); for (i = 0; i < 8; i++) { if (prio_tc[i] > mlx5_max_tc(mdev)) return -EINVAL; @@ -615,11 +585,9 @@ static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out, int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group) { - u32 in[MLX5_ST_SZ_DW(qetc_reg)]; + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; int i; - memset(in, 0, sizeof(in)); - for (i = 0; i <= mlx5_max_tc(mdev); i++) { MLX5_SET(qetc_reg, in, tc_configuration[i].g, 1); MLX5_SET(qetc_reg, in, tc_configuration[i].group, tc_group[i]); @@ -631,11 +599,9 @@ EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group); int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw) { - u32 in[MLX5_ST_SZ_DW(qetc_reg)]; + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; int i; - memset(in, 0, sizeof(in)); - for (i = 0; i <= mlx5_max_tc(mdev); i++) { MLX5_SET(qetc_reg, in, tc_configuration[i].b, 1); MLX5_SET(qetc_reg, in, tc_configuration[i].bw_allocation, tc_bw[i]); @@ -649,12 +615,10 @@ int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, u8 *max_bw_value, u8 *max_bw_units) { - u32 in[MLX5_ST_SZ_DW(qetc_reg)]; + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; void *ets_tcn_conf; int i; - memset(in, 0, sizeof(in)); - MLX5_SET(qetc_reg, in, port_number, 1); for (i = 0; i <= mlx5_max_tc(mdev); i++) { @@ -699,35 +663,24 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit); int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode) { - u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)]; - u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)] = {0}; MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL); MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1); MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode); - - return mlx5_cmd_exec_check_status(mdev, in, sizeof(in), - out, sizeof(out)); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL_GPL(mlx5_set_port_wol); int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode) { - u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)]; - u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)]; + u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {0}; int err; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL); - - err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), - out, sizeof(out)); - + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (!err) *wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode); @@ -738,11 +691,9 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_wol); static int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(pcmr_reg)]; + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(pcmr_reg, in, local_port, 1); - return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen, MLX5_REG_PCMR, 0, 0); } @@ -757,12 +708,10 @@ static int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen) int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable) { - u32 in[MLX5_ST_SZ_DW(pcmr_reg)]; + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; - memset(in, 0, sizeof(in)); MLX5_SET(pcmr_reg, in, local_port, 1); MLX5_SET(pcmr_reg, in, fcs_chk, enable); - return mlx5_set_ports_check(mdev, in, sizeof(in)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 50875a4e9d58..d0a4005fe63a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -281,7 +281,6 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; @@ -307,7 +306,6 @@ err_cmd: MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); - mlx5_cmd_status_to_err_v2(dout); return err; } EXPORT_SYMBOL_GPL(mlx5_core_create_qp); @@ -326,7 +324,6 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (err) return err; @@ -453,7 +450,6 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, return err; err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); - err = err ? : mlx5_cmd_status_to_err_v2(mbox.out); mbox_free(&mbox); return err; } @@ -478,13 +474,10 @@ int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *out, int outlen) { u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0}; - int err; MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); MLX5_SET(query_qp_in, in, qpn, qp->qpn); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL_GPL(mlx5_core_qp_query); @@ -496,7 +489,6 @@ int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); if (!err) *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); return err; @@ -507,12 +499,10 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) { u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0}; u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {0}; - int err; MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); @@ -522,11 +512,9 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, { u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0}; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {0}; - int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, qpn, qpn); if (flags & MLX5_PAGE_FAULT_RESUME_REQUESTOR) @@ -538,8 +526,7 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, if (error) MLX5_SET(page_fault_resume_in, in, error, 1); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif @@ -615,7 +602,7 @@ int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id) int err; MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (!err) *counter_id = MLX5_GET(alloc_q_counter_out, out, counter_set_id); @@ -631,8 +618,7 @@ int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id) MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter); @@ -644,7 +630,7 @@ int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); MLX5_SET(query_q_counter_in, in, clear, reset); MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_size); + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); } EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index c07c28bd3d55..104902a93a0b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -63,19 +63,14 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, u32 rate, u16 index) { - u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)]; - u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0}; MLX5_SET(set_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_RATE_LIMIT); MLX5_SET(set_rate_limit_in, in, rate_limit_index, index); MLX5_SET(set_rate_limit_in, in, rate_limit, rate); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), - out, sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index c07f4d01b70e..3099630015d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -175,8 +175,8 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(create_srq_in, create_in, opcode, MLX5_CMD_OP_CREATE_SRQ); - err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out, - sizeof(create_out)); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); kvfree(create_in); if (!err) srq->srqn = MLX5_GET(create_srq_out, create_out, srqn); @@ -194,8 +194,8 @@ static int destroy_srq_cmd(struct mlx5_core_dev *dev, MLX5_CMD_OP_DESTROY_SRQ); MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn); - return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in), - srq_out, sizeof(srq_out)); + return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); } static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, @@ -209,8 +209,8 @@ static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(arm_xrc_srq_in, srq_in, xrc_srqn, srq->srqn); MLX5_SET(arm_xrc_srq_in, srq_in, lwm, lwm); - return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in), - srq_out, sizeof(srq_out)); + return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); } static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, @@ -228,9 +228,8 @@ static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(query_srq_in, srq_in, opcode, MLX5_CMD_OP_QUERY_SRQ); MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn); - err = mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in), - srq_out, - MLX5_ST_SZ_BYTES(query_srq_out)); + err = mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, MLX5_ST_SZ_BYTES(query_srq_out)); if (err) goto out; @@ -272,8 +271,8 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, MLX5_CMD_OP_CREATE_XRC_SRQ); memset(create_out, 0, sizeof(create_out)); - err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out, - sizeof(create_out)); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); if (err) goto out; @@ -286,36 +285,30 @@ out: static int destroy_xrc_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) { - u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]; - u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)]; - - memset(xrcsrq_in, 0, sizeof(xrcsrq_in)); - memset(xrcsrq_out, 0, sizeof(xrcsrq_out)); + u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0}; + u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0}; MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ); MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); - return mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, sizeof(xrcsrq_out)); + return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), + xrcsrq_out, sizeof(xrcsrq_out)); } static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm) { - u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]; - u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)]; - - memset(xrcsrq_in, 0, sizeof(xrcsrq_in)); - memset(xrcsrq_out, 0, sizeof(xrcsrq_out)); + u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; + u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod, MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm, lwm); - return mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, sizeof(xrcsrq_out)); + return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), + xrcsrq_out, sizeof(xrcsrq_out)); } static int query_xrc_srq_cmd(struct mlx5_core_dev *dev, @@ -335,9 +328,9 @@ static int query_xrc_srq_cmd(struct mlx5_core_dev *dev, MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_QUERY_XRC_SRQ); MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); - err = mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, - MLX5_ST_SZ_BYTES(query_xrc_srq_out)); + + err = mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), xrcsrq_out, + MLX5_ST_SZ_BYTES(query_xrc_srq_out)); if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index 28274a6fbafe..a00ff49eec18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -36,17 +36,14 @@ int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) { - u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]; - u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)]; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; int err; - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (!err) *tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain); @@ -57,29 +54,23 @@ EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) { - u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]; - u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) { - u32 out[MLX5_ST_SZ_DW(create_rq_out)]; + u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {0}; int err; MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *rqn = MLX5_GET(create_rq_out, out, rqn); @@ -95,21 +86,18 @@ int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ); memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_modify_rq); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) { - u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_rq_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0}; MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); MLX5_SET(destroy_rq_in, in, rqn, rqn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_rq); @@ -121,19 +109,17 @@ int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); MLX5_SET(query_rq_in, in, rqn, rqn); - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL(mlx5_core_query_rq); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) { - u32 out[MLX5_ST_SZ_DW(create_sq_out)]; + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {0}; int err; MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *sqn = MLX5_GET(create_sq_out, out, sqn); @@ -142,27 +128,22 @@ int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_sq_out)]; + u32 out[MLX5_ST_SZ_DW(modify_sq_out)] = {0}; MLX5_SET(modify_sq_in, in, sqn, sqn); MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_modify_sq); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) { - u32 in[MLX5_ST_SZ_DW(destroy_sq_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_sq_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {0}; MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); MLX5_SET(destroy_sq_in, in, sqn, sqn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) @@ -172,21 +153,20 @@ int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); MLX5_SET(query_sq_in, in, sqn, sqn); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } EXPORT_SYMBOL(mlx5_core_query_sq); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn) { - u32 out[MLX5_ST_SZ_DW(create_tir_out)]; + u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {0}; int err; MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *tirn = MLX5_GET(create_tir_out, out, tirn); @@ -197,39 +177,32 @@ EXPORT_SYMBOL(mlx5_core_create_tir); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_tir_out)]; + u32 out[MLX5_ST_SZ_DW(modify_tir_out)] = {0}; MLX5_SET(modify_tir_in, in, tirn, tirn); MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) { - u32 in[MLX5_ST_SZ_DW(destroy_tir_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_tir_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {0}; MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); MLX5_SET(destroy_tir_in, in, tirn, tirn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_tir); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn) { - u32 out[MLX5_ST_SZ_DW(create_tis_out)]; + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {0}; int err; MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *tisn = MLX5_GET(create_tis_out, out, tisn); @@ -245,34 +218,29 @@ int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, MLX5_SET(modify_tis_in, in, tisn, tisn); MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_modify_tis); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) { - u32 in[MLX5_ST_SZ_DW(destroy_tis_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_tis_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); MLX5_SET(destroy_tis_in, in, tisn, tisn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_tis); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn) { - u32 out[MLX5_ST_SZ_DW(create_rmp_out)]; + u32 out[MLX5_ST_SZ_DW(create_rmp_out)] = {0}; int err; MLX5_SET(create_rmp_in, in, opcode, MLX5_CMD_OP_CREATE_RMP); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *rmpn = MLX5_GET(create_rmp_out, out, rmpn); @@ -281,38 +249,31 @@ int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_rmp_out)]; + u32 out[MLX5_ST_SZ_DW(modify_rmp_out)] = {0}; MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn) { - u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {0}; MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP); MLX5_SET(destroy_rmp_in, in, rmpn, rmpn); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_rmp_in)]; + u32 in[MLX5_ST_SZ_DW(query_rmp_in)] = {0}; int outlen = MLX5_ST_SZ_BYTES(query_rmp_out); - memset(in, 0, sizeof(in)); MLX5_SET(query_rmp_in, in, opcode, MLX5_CMD_OP_QUERY_RMP); MLX5_SET(query_rmp_in, in, rmpn, rmpn); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm) @@ -347,13 +308,11 @@ int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm) int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *xsrqn) { - u32 out[MLX5_ST_SZ_DW(create_xrc_srq_out)]; + u32 out[MLX5_ST_SZ_DW(create_xrc_srq_out)] = {0}; int err; MLX5_SET(create_xrc_srq_in, in, opcode, MLX5_CMD_OP_CREATE_XRC_SRQ); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *xsrqn = MLX5_GET(create_xrc_srq_out, out, xrc_srqn); @@ -362,33 +321,25 @@ int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 xsrqn) { - u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0}; MLX5_SET(destroy_xrc_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ); MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, xsrqn); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)]; + u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)] = {0}; void *srqc; void *xrc_srqc; int err; - memset(in, 0, sizeof(in)); MLX5_SET(query_xrc_srq_in, in, opcode, MLX5_CMD_OP_QUERY_XRC_SRQ); MLX5_SET(query_xrc_srq_in, in, xrc_srqn, xsrqn); - - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), - out, - MLX5_ST_SZ_BYTES(query_xrc_srq_out)); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, + MLX5_ST_SZ_BYTES(query_xrc_srq_out)); if (!err) { xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, out, xrc_srq_context_entry); @@ -401,32 +352,25 @@ int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u32 *out) int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u16 lwm) { - u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]; - u32 out[MLX5_ST_SZ_DW(arm_xrc_srq_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; MLX5_SET(arm_xrc_srq_in, in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, xsrqn); MLX5_SET(arm_xrc_srq_in, in, lwm, lwm); MLX5_SET(arm_xrc_srq_in, in, op_mod, MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); - - return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn) { - u32 out[MLX5_ST_SZ_DW(create_rqt_out)]; + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; int err; MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (!err) *rqtn = MLX5_GET(create_rqt_out, out, rqtn); @@ -437,25 +381,20 @@ EXPORT_SYMBOL(mlx5_core_create_rqt); int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_rqt_out)]; + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {0}; MLX5_SET(modify_rqt_in, in, rqtn, rqtn); MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) { - u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); - - mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_core_destroy_rqt); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index d0a0e0bc77e4..ab0b896621a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -50,11 +50,8 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - err = err ? : mlx5_cmd_status_to_err_v2(out); - if (err) - return err; - - *uarn = MLX5_GET(alloc_uar_out, out, uar); + if (!err) + *uarn = MLX5_GET(alloc_uar_out, out, uar); return err; } EXPORT_SYMBOL(mlx5_cmd_alloc_uar); @@ -63,12 +60,10 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) { u32 out[MLX5_ST_SZ_DW(dealloc_uar_out)] = {0}; u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {0}; - int err; MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); MLX5_SET(dealloc_uar_in, in, uar, uarn); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - return err ? : mlx5_cmd_status_to_err_v2(out); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL(mlx5_cmd_free_uar); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 21365d06982b..3593bf78caf4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -39,10 +39,7 @@ static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u32 *out, int outlen) { - int err; - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0}; MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); @@ -51,11 +48,7 @@ static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, if (vport) MLX5_SET(query_vport_state_in, in, other_vport, 1); - err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); - if (err) - mlx5_core_warn(mdev, "MLX5_CMD_OP_QUERY_VPORT_STATE failed\n"); - - return err; + return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); } u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) @@ -81,58 +74,43 @@ EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state) { - u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)]; - u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)]; - int err; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)] = {0}; MLX5_SET(modify_vport_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VPORT_STATE); MLX5_SET(modify_vport_state_in, in, op_mod, opmod); MLX5_SET(modify_vport_state_in, in, vport_number, vport); - if (vport) MLX5_SET(modify_vport_state_in, in, other_vport, 1); - MLX5_SET(modify_vport_state_in, in, admin_state, state); - err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, - sizeof(out)); - if (err) - mlx5_core_warn(mdev, "MLX5_CMD_OP_MODIFY_VPORT_STATE failed\n"); - - return err; + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state); static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; - - memset(in, 0, sizeof(in)); + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); - MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); if (vport) MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); - return mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); } static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; MLX5_SET(modify_nic_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec_check_status(mdev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); } void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, @@ -254,7 +232,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, u8 addr_list[][ETH_ALEN], int *list_size) { - u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; void *nic_vport_ctx; int max_list_size; int req_list_size; @@ -278,7 +256,6 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); - memset(in, 0, sizeof(in)); out = kzalloc(out_sz, GFP_KERNEL); if (!out) return -ENOMEM; @@ -291,7 +268,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, if (vport) MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_sz); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); if (err) goto out; @@ -361,7 +338,7 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, ether_addr_copy(curr_mac, addr_list[i]); } - err = mlx5_cmd_exec_check_status(dev, in, in_sz, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); kfree(in); return err; } @@ -406,7 +383,7 @@ int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, if (vport) MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); - err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_sz); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); if (err) goto out; @@ -473,7 +450,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, MLX5_SET(vlan_layout, vlan_addr, vlan, vlans[i]); } - err = mlx5_cmd_exec_check_status(dev, in, in_sz, out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); kfree(in); return err; } @@ -631,10 +608,6 @@ int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, if (err) goto out; - err = mlx5_cmd_status_to_err_v2(out); - if (err) - goto out; - tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); gid->global.subnet_prefix = tmp->global.subnet_prefix; gid->global.interface_id = tmp->global.interface_id; @@ -700,10 +673,6 @@ int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport, if (err) goto out; - err = mlx5_cmd_status_to_err_v2(out); - if (err) - goto out; - pkarr = MLX5_ADDR_OF(query_hca_vport_pkey_out, out, pkey); for (i = 0; i < nout; i++, pkey++, pkarr += MLX5_ST_SZ_BYTES(pkey)) *pkey = MLX5_GET_PR(pkey, pkarr, pkey); @@ -721,7 +690,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *rep) { int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); - int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)]; + int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {0}; int is_group_manager; void *out; void *ctx; @@ -729,7 +698,6 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); - memset(in, 0, sizeof(in)); out = kzalloc(out_sz, GFP_KERNEL); if (!out) return -ENOMEM; @@ -750,9 +718,6 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); - if (err) - goto ex; - err = mlx5_cmd_status_to_err_v2(out); if (err) goto ex; @@ -969,10 +934,6 @@ int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, MLX5_SET(query_vport_counter_in, in, port_num, port_num); err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); - if (err) - goto free; - err = mlx5_cmd_status_to_err_v2(out); - free: kvfree(in); return err; @@ -1035,11 +996,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, MLX5_SET(hca_vport_context, ctx, qkey_violation_counter, req->qkey_violation_counter); MLX5_SET(hca_vport_context, ctx, pkey_violation_counter, req->pkey_violation_counter); err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); - if (err) - goto ex; - - err = mlx5_cmd_status_to_err_v2(out); - ex: kfree(in); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c index e25a73ed2981..07a9ba6cfc70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -46,41 +46,24 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv) static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) { - struct mlx5_outbox_hdr *hdr; - int err; - - u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]; - u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; MLX5_SET(add_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); - - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); - if (err) - return err; - - hdr = (struct mlx5_outbox_hdr *)out; - return hdr->status ? -ENOMEM : 0; + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5e_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) { - u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)]; - u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); + u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0}; MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); - - return mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, - sizeof(out)); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fb002db1e2f0..2575070c836e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -398,33 +398,6 @@ enum { MLX5_MAX_SGE_RD = (512 - 16 - 16) / 16 }; -struct mlx5_inbox_hdr { - __be16 opcode; - u8 rsvd[4]; - __be16 opmod; -}; - -struct mlx5_outbox_hdr { - u8 status; - u8 rsvd[3]; - __be32 syndrome; -}; - -struct mlx5_cmd_query_adapter_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_query_adapter_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[24]; - u8 intapin; - u8 rsvd1[13]; - __be16 vsd_vendor_id; - u8 vsd[208]; - u8 vsd_psid[16]; -}; - enum mlx5_odp_transport_cap_bits { MLX5_ODP_SUPPORT_SEND = 1 << 31, MLX5_ODP_SUPPORT_RECV = 1 << 30, @@ -457,7 +430,6 @@ struct mlx5_cmd_layout { u8 status_own; }; - struct health_buffer { __be32 assert_var[5]; __be32 rsvd0[3]; @@ -819,93 +791,6 @@ struct mlx5_cqe128 { struct mlx5_cqe64 cqe64; }; -struct mlx5_srq_ctx { - u8 state_log_sz; - u8 rsvd0[3]; - __be32 flags_xrcd; - __be32 pgoff_cqn; - u8 rsvd1[4]; - u8 log_pg_sz; - u8 rsvd2[7]; - __be32 pd; - __be16 lwm; - __be16 wqe_cnt; - u8 rsvd3[8]; - __be64 db_record; -}; - -struct mlx5_create_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_srqn; - u8 rsvd0[4]; - struct mlx5_srq_ctx ctx; - u8 rsvd1[208]; - __be64 pas[0]; -}; - -struct mlx5_create_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 srqn; - u8 rsvd[4]; -}; - -struct mlx5_destroy_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - u8 rsvd[4]; -}; - -struct mlx5_destroy_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - u8 rsvd0[4]; -}; - -struct mlx5_query_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_srq_ctx ctx; - u8 rsvd1[32]; - __be64 pas[0]; -}; - -struct mlx5_arm_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - __be16 rsvd; - __be16 lwm; -}; - -struct mlx5_arm_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_enable_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_enable_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_disable_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_disable_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 173817187abb..ebe57abf3324 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -771,14 +771,15 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); -int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); -int mlx5_cmd_status_to_err_v2(void *ptr); -int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); + int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size, mlx5_cmd_cbk_t callback, void *context); +void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); -- cgit v1.2.3 From 9def7121bed3be8a9d126c900ca7067647bc4789 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 3 Aug 2016 17:27:30 +0300 Subject: net/mlx5: Enable setting minimum inline header mode for VFs Implement the low-level part of the PF side in setting minimum inline header mode for VFs. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 20 ++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 2 +- include/linux/mlx5/vport.h | 2 ++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 3593bf78caf4..525f17af108e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -125,6 +125,26 @@ void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline); +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline) +{ + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *nic_vport_ctx; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.min_inline, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + MLX5_SET(nic_vport_context, nic_vport_ctx, + min_wqe_inline_mode, min_inline); + + return mlx5_modify_nic_vport_context(mdev, in, inlen); +} + int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cb94ac5b8420..7a8ef0af94e7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4724,7 +4724,7 @@ struct mlx5_ifc_modify_nic_vport_field_select_bits { u8 reserved_at_0[0x16]; u8 node_guid[0x1]; u8 port_guid[0x1]; - u8 reserved_at_18[0x1]; + u8 min_inline[0x1]; u8 mtu[0x1]; u8 change_event[0x1]; u8 promisc[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index e087b7d047ac..451b0bde9083 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,8 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline); +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); -- cgit v1.2.3 From 7adbde2035c2e5baf2f6a90eba11813db4813a67 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 3 Aug 2016 15:08:33 +0300 Subject: net/mlx5: Update mlx5_ifc.h for vxlan encap/decap Add the required definitions related to vxlan encap/decap. Signed-off-by: Hadar Hen Zion Signed-off-by: Ilya Lesokhin Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 105 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7a8ef0af94e7..3766110e13ea 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -212,6 +212,8 @@ enum { MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a, MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b, MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, + MLX5_CMD_OP_ALLOC_ENCAP_HEADER = 0x93d, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER = 0x93e, MLX5_CMD_OP_MAX }; @@ -281,7 +283,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 modify_root[0x1]; u8 identified_miss_table_mode[0x1]; u8 flow_table_modify[0x1]; - u8 reserved_at_7[0x19]; + u8 encap[0x1]; + u8 decap[0x1]; + u8 reserved_at_9[0x17]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -512,7 +516,15 @@ struct mlx5_ifc_e_switch_cap_bits { u8 nic_vport_node_guid_modify[0x1]; u8 nic_vport_port_guid_modify[0x1]; - u8 reserved_at_20[0x7e0]; + u8 vxlan_encap_decap[0x1]; + u8 nvgre_encap_decap[0x1]; + u8 reserved_at_22[0x9]; + u8 log_max_encap_headers[0x5]; + u8 reserved_2b[0x6]; + u8 max_encap_header_size[0xa]; + + u8 reserved_40[0x7c0]; + }; struct mlx5_ifc_qos_cap_bits { @@ -2067,6 +2079,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_DROP = 0x2, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, + MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, + MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, }; struct mlx5_ifc_flow_context_bits { @@ -2086,7 +2100,9 @@ struct mlx5_ifc_flow_context_bits { u8 reserved_at_a0[0x8]; u8 flow_counter_list_size[0x18]; - u8 reserved_at_c0[0x140]; + u8 encap_id[0x20]; + + u8 reserved_at_e0[0x120]; struct mlx5_ifc_fte_match_param_bits match_value; @@ -4216,6 +4232,85 @@ struct mlx5_ifc_query_eq_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_encap_header_in_bits { + u8 reserved_at_0[0x5]; + u8 header_type[0x3]; + u8 reserved_at_8[0xe]; + u8 encap_header_size[0xa]; + + u8 reserved_at_20[0x10]; + u8 encap_header[2][0x8]; + + u8 more_encap_header[0][0x8]; +}; + +struct mlx5_ifc_query_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header[0]; +}; + +struct mlx5_ifc_query_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0xa0]; +}; + +struct mlx5_ifc_alloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header; +}; + +struct mlx5_ifc_dealloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_60[0x20]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6102,7 +6197,9 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_at_a0[0x20]; - u8 reserved_at_c0[0x4]; + u8 encap_en[0x1]; + u8 decap_en[0x1]; + u8 reserved_at_c2[0x2]; u8 table_miss_mode[0x4]; u8 level[0x8]; u8 reserved_at_d0[0x8]; -- cgit v1.2.3 From 92e47ba8839bacc185db89f3b11cd8036193e6a9 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 13 Aug 2016 22:35:36 +0800 Subject: netfilter: conntrack: simplify the code by using nf_conntrack_get_ht Since commit 64b87639c9cb ("netfilter: conntrack: fix race between nf_conntrack proc read and hash resize") introduce the nf_conntrack_get_ht, so there's no need to check nf_conntrack_generation again and again to get the hash table and hash size. And convert nf_conntrack_get_ht to inline function here. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 20 ++++++++++++++ include/net/netfilter/nf_conntrack_core.h | 3 -- net/netfilter/nf_conntrack_core.c | 46 +++++++------------------------ 3 files changed, 30 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 445b019c2078..2a127480d4cc 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -303,9 +303,29 @@ struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); + +extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; +extern seqcount_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; +/* must be called with rcu read lock held */ +static inline void +nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +{ + struct hlist_nulls_head *hptr; + unsigned int sequence, hsz; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hsz = nf_conntrack_htable_size; + hptr = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + *hash = hptr; + *hsize = hsz; +} + struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 79d7ac5c9740..62e17d1319ff 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -51,8 +51,6 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); -void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize); - /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, @@ -83,7 +81,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, #define CONNTRACK_LOCKS 1024 -extern struct hlist_nulls_head *nf_conntrack_hash; extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 22558b7ff7cd..aeba28c5512b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,7 +74,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; -static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; @@ -162,6 +161,7 @@ static void nf_conntrack_all_unlock(void) unsigned int nf_conntrack_htable_size __read_mostly; unsigned int nf_conntrack_max __read_mostly; +seqcount_t nf_conntrack_generation __read_mostly; DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); @@ -478,23 +478,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } -/* must be called with rcu read lock held */ -void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) -{ - struct hlist_nulls_head *hptr; - unsigned int sequence, hsz; - - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hsz = nf_conntrack_htable_size; - hptr = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); - - *hash = hptr; - *hsize = hsz; -} -EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); - /* * Warning : * - Caller must take a reference on returned object @@ -507,14 +490,11 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket, sequence; + unsigned int bucket, hsize; begin: - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - bucket = scale_hash(hash); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { if (nf_ct_key_equal(h, tuple, zone, net)) { @@ -820,18 +800,15 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; - unsigned int hash, sequence; + unsigned int hash, hsize; struct hlist_nulls_node *n; struct nf_conn *ct; zone = nf_ct_zone(ignored_conntrack); rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = hash_conntrack(net, tuple); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = __hash_conntrack(net, tuple, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -897,14 +874,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash) for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned hash, sequence, drops; + unsigned int hash, hsize, drops; rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = scale_hash(_hash++); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = reciprocal_scale(_hash++, hsize); drops = early_drop_list(net, &ct_hash[hash]); rcu_read_unlock(); -- cgit v1.2.3 From 43a0c6751a322847cb6fa0ab8cbf77a1d08bfc0a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 15 Aug 2016 14:51:01 -0700 Subject: strparser: Stream parser for messages This patch introduces a utility for parsing application layer protocol messages in a TCP stream. This is a generalization of the mechanism implemented of Kernel Connection Multiplexor. The API includes a context structure, a set of callbacks, utility functions, and a data ready function. A stream parser instance is defined by a strparse structure that is bound to a TCP socket. The function to initialize the structure is: int strp_init(struct strparser *strp, struct sock *csk, struct strp_callbacks *cb); csk is the TCP socket being bound to and cb are the parser callbacks. The upper layer calls strp_tcp_data_ready when data is ready on the lower socket for strparser to process. This should be called from a data_ready callback that is set on the socket: void strp_tcp_data_ready(struct strparser *strp); A parser is bound to a TCP socket by setting data_ready function to strp_tcp_data_ready so that all receive indications on the socket go through the parser. This is assumes that sk_user_data is set to the strparser structure. There are four callbacks. - parse_msg is called to parse the message (returns length or error). - rcv_msg is called when a complete message has been received - read_sock_done is called when data_ready function exits - abort_parser is called to abort the parser The input to parse_msg is an skbuff which contains next message under construction. The backend processing of parse_msg will parse the application layer protocol headers to determine the length of the message in the stream. The possible return values are: >0 : indicates length of successfully parsed message 0 : indicates more data must be received to parse the message -ESTRPIPE : current message should not be processed by the kernel, return control of the socket to userspace which can proceed to read the messages itself other < 0 : Error is parsing, give control back to userspace assuming that synchronzation is lost and the stream is unrecoverable (application expected to close TCP socket) In the case of error return (< 0) strparse will stop the parser and report and error to userspace. The application must deal with the error. To handle the error the strparser is unbound from the TCP socket. If the error indicates that the stream TCP socket is at recoverable point (ESTRPIPE) then the application can read the TCP socket to process the stream. Once the application has dealt with the exceptions in the stream, it may again bind the socket to a strparser to continue data operations. Note that ENODATA may be returned to the application. In this case parse_msg returned -ESTRPIPE, however strparser was unable to maintain synchronization of the stream (i.e. some of the message in question was already read by the parser). strp_pause and strp_unpause are used to provide flow control. For instance, if rcv_msg is called but the upper layer can't immediately consume the message it can hold the message and pause strparser. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/strparser.h | 145 ++++++++++++++ net/Kconfig | 1 + net/Makefile | 1 + net/strparser/Kconfig | 4 + net/strparser/Makefile | 1 + net/strparser/strparser.c | 492 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 644 insertions(+) create mode 100644 include/net/strparser.h create mode 100644 net/strparser/Kconfig create mode 100644 net/strparser/Makefile create mode 100644 net/strparser/strparser.c (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h new file mode 100644 index 000000000000..fdb3d6746cc4 --- /dev/null +++ b/include/net/strparser.h @@ -0,0 +1,145 @@ +/* + * Stream Parser + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#ifndef __NET_STRPARSER_H_ +#define __NET_STRPARSER_H_ + +#include +#include + +#define STRP_STATS_ADD(stat, count) ((stat) += (count)) +#define STRP_STATS_INCR(stat) ((stat)++) + +struct strp_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned int rx_mem_fail; + unsigned int rx_need_more_hdr; + unsigned int rx_msg_too_big; + unsigned int rx_msg_timeouts; + unsigned int rx_bad_hdr_len; +}; + +struct strp_aggr_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned int rx_mem_fail; + unsigned int rx_need_more_hdr; + unsigned int rx_msg_too_big; + unsigned int rx_msg_timeouts; + unsigned int rx_bad_hdr_len; + unsigned int rx_aborts; + unsigned int rx_interrupted; + unsigned int rx_unrecov_intr; +}; + +struct strparser; + +/* Callbacks are called with lock held for the attached socket */ +struct strp_callbacks { + int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); + void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + int (*read_sock_done)(struct strparser *strp, int err); + void (*abort_parser)(struct strparser *strp, int err); +}; + +struct strp_rx_msg { + int full_len; + int offset; +}; + +static inline struct strp_rx_msg *strp_rx_msg(struct sk_buff *skb) +{ + return (struct strp_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +/* Structure for an attached lower socket */ +struct strparser { + struct sock *sk; + + u32 rx_stopped : 1; + u32 rx_paused : 1; + u32 rx_aborted : 1; + u32 rx_interrupted : 1; + u32 rx_unrecov_intr : 1; + + struct sk_buff **rx_skb_nextp; + struct timer_list rx_msg_timer; + struct sk_buff *rx_skb_head; + unsigned int rx_need_bytes; + struct delayed_work rx_delayed_work; + struct work_struct rx_work; + struct strp_stats stats; + struct strp_callbacks cb; +}; + +/* Must be called with lock held for attached socket */ +static inline void strp_pause(struct strparser *strp) +{ + strp->rx_paused = 1; +} + +/* May be called without holding lock for attached socket */ +static inline void strp_unpause(struct strparser *strp) +{ + strp->rx_paused = 0; +} + +static inline void save_strp_stats(struct strparser *strp, + struct strp_aggr_stats *agg_stats) +{ + /* Save psock statistics in the mux when psock is being unattached. */ + +#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ + strp->stats._stat) + SAVE_PSOCK_STATS(rx_msgs); + SAVE_PSOCK_STATS(rx_bytes); + SAVE_PSOCK_STATS(rx_mem_fail); + SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_msg_too_big); + SAVE_PSOCK_STATS(rx_msg_timeouts); + SAVE_PSOCK_STATS(rx_bad_hdr_len); +#undef SAVE_PSOCK_STATS + + if (strp->rx_aborted) + agg_stats->rx_aborts++; + if (strp->rx_interrupted) + agg_stats->rx_interrupted++; + if (strp->rx_unrecov_intr) + agg_stats->rx_unrecov_intr++; +} + +static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, + struct strp_aggr_stats *agg_stats) +{ +#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) + SAVE_PSOCK_STATS(rx_msgs); + SAVE_PSOCK_STATS(rx_bytes); + SAVE_PSOCK_STATS(rx_mem_fail); + SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_msg_too_big); + SAVE_PSOCK_STATS(rx_msg_timeouts); + SAVE_PSOCK_STATS(rx_bad_hdr_len); + SAVE_PSOCK_STATS(rx_aborts); + SAVE_PSOCK_STATS(rx_interrupted); + SAVE_PSOCK_STATS(rx_unrecov_intr); +#undef SAVE_PSOCK_STATS + +} + +void strp_done(struct strparser *strp); +void strp_stop(struct strparser *strp); +void strp_check_rcv(struct strparser *strp); +int strp_init(struct strparser *strp, struct sock *csk, + struct strp_callbacks *cb); +void strp_tcp_data_ready(struct strparser *strp); + +#endif /* __NET_STRPARSER_H_ */ diff --git a/net/Kconfig b/net/Kconfig index c2cdbce629bd..7b6cd340b72b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -369,6 +369,7 @@ source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" +source "net/strparser/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9bd20bb86cc6..4cafaa2b4667 100644 --- a/net/Makefile +++ b/net/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ obj-$(CONFIG_AF_KCM) += kcm/ +obj-$(CONFIG_STREAM_PARSER) += strparser/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig new file mode 100644 index 000000000000..6cff3f6d0c3a --- /dev/null +++ b/net/strparser/Kconfig @@ -0,0 +1,4 @@ + +config STREAM_PARSER + tristate + default n diff --git a/net/strparser/Makefile b/net/strparser/Makefile new file mode 100644 index 000000000000..858a126ebaa0 --- /dev/null +++ b/net/strparser/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_STREAM_PARSER) += strparser.o diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c new file mode 100644 index 000000000000..fd688c0a7744 --- /dev/null +++ b/net/strparser/strparser.c @@ -0,0 +1,492 @@ +/* + * Stream Parser + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct workqueue_struct *strp_wq; + +struct _strp_rx_msg { + /* Internal cb structure. struct strp_rx_msg must be first for passing + * to upper layer. + */ + struct strp_rx_msg strp; + int accum_len; + int early_eaten; +}; + +static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb) +{ + return (struct _strp_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +/* Lower lock held */ +static void strp_abort_rx_strp(struct strparser *strp, int err) +{ + struct sock *csk = strp->sk; + + /* Unrecoverable error in receive */ + + del_timer(&strp->rx_msg_timer); + + if (strp->rx_stopped) + return; + + strp->rx_stopped = 1; + + /* Report an error on the lower socket */ + csk->sk_err = err; + csk->sk_error_report(csk); +} + +static void strp_start_rx_timer(struct strparser *strp) +{ + if (strp->sk->sk_rcvtimeo) + mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo); +} + +/* Lower lock held */ +static void strp_parser_err(struct strparser *strp, int err, + read_descriptor_t *desc) +{ + desc->error = err; + kfree_skb(strp->rx_skb_head); + strp->rx_skb_head = NULL; + strp->cb.abort_parser(strp, err); +} + +/* Lower socket lock held */ +static int strp_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct strparser *strp = (struct strparser *)desc->arg.data; + struct _strp_rx_msg *rxm; + struct sk_buff *head, *skb; + size_t eaten = 0, cand_len; + ssize_t extra; + int err; + bool cloned_orig = false; + + if (strp->rx_paused) + return 0; + + head = strp->rx_skb_head; + if (head) { + /* Message already in progress */ + + rxm = _strp_rx_msg(head); + if (unlikely(rxm->early_eaten)) { + /* Already some number of bytes on the receive sock + * data saved in rx_skb_head, just indicate they + * are consumed. + */ + eaten = orig_len <= rxm->early_eaten ? + orig_len : rxm->early_eaten; + rxm->early_eaten -= eaten; + + return eaten; + } + + if (unlikely(orig_offset)) { + /* Getting data with a non-zero offset when a message is + * in progress is not expected. If it does happen, we + * need to clone and pull since we can't deal with + * offsets in the skbs for a message expect in the head. + */ + orig_skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!orig_skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + if (!pskb_pull(orig_skb, orig_offset)) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + kfree_skb(orig_skb); + desc->error = -ENOMEM; + return 0; + } + cloned_orig = true; + orig_offset = 0; + } + + if (!strp->rx_skb_nextp) { + /* We are going to append to the frags_list of head. + * Need to unshare the frag_list. + */ + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = err; + return 0; + } + + if (unlikely(skb_shinfo(head)->frag_list)) { + /* We can't append to an sk_buff that already + * has a frag_list. We create a new head, point + * the frag_list of that to the old head, and + * then are able to use the old head->next for + * appending to the message. + */ + if (WARN_ON(head->next)) { + desc->error = -EINVAL; + return 0; + } + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + skb->len = head->len; + skb->data_len = head->len; + skb->truesize = head->truesize; + *_strp_rx_msg(skb) = *_strp_rx_msg(head); + strp->rx_skb_nextp = &head->next; + skb_shinfo(skb)->frag_list = head; + strp->rx_skb_head = skb; + head = skb; + } else { + strp->rx_skb_nextp = + &skb_shinfo(head)->frag_list; + } + } + } + + while (eaten < orig_len) { + /* Always clone since we will consume something */ + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + break; + } + + cand_len = orig_len - eaten; + + head = strp->rx_skb_head; + if (!head) { + head = skb; + strp->rx_skb_head = head; + /* Will set rx_skb_nextp on next packet if needed */ + strp->rx_skb_nextp = NULL; + rxm = _strp_rx_msg(head); + memset(rxm, 0, sizeof(*rxm)); + rxm->strp.offset = orig_offset + eaten; + } else { + /* Unclone since we may be appending to an skb that we + * already share a frag_list with. + */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = err; + break; + } + + rxm = _strp_rx_msg(head); + *strp->rx_skb_nextp = skb; + strp->rx_skb_nextp = &skb->next; + head->data_len += skb->len; + head->len += skb->len; + head->truesize += skb->truesize; + } + + if (!rxm->strp.full_len) { + ssize_t len; + + len = (*strp->cb.parse_msg)(strp, head); + + if (!len) { + /* Need more header to determine length */ + if (!rxm->accum_len) { + /* Start RX timer for new message */ + strp_start_rx_timer(strp); + } + rxm->accum_len += cand_len; + eaten += cand_len; + STRP_STATS_INCR(strp->stats.rx_need_more_hdr); + WARN_ON(eaten != orig_len); + break; + } else if (len < 0) { + if (len == -ESTRPIPE && rxm->accum_len) { + len = -ENODATA; + strp->rx_unrecov_intr = 1; + } else { + strp->rx_interrupted = 1; + } + strp_parser_err(strp, err, desc); + break; + } else if (len > strp->sk->sk_rcvbuf) { + /* Message length exceeds maximum allowed */ + STRP_STATS_INCR(strp->stats.rx_msg_too_big); + strp_parser_err(strp, -EMSGSIZE, desc); + break; + } else if (len <= (ssize_t)head->len - + skb->len - rxm->strp.offset) { + /* Length must be into new skb (and also + * greater than zero) + */ + STRP_STATS_INCR(strp->stats.rx_bad_hdr_len); + strp_parser_err(strp, -EPROTO, desc); + break; + } + + rxm->strp.full_len = len; + } + + extra = (ssize_t)(rxm->accum_len + cand_len) - + rxm->strp.full_len; + + if (extra < 0) { + /* Message not complete yet. */ + if (rxm->strp.full_len - rxm->accum_len > + tcp_inq(strp->sk)) { + /* Don't have the whole messages in the socket + * buffer. Set strp->rx_need_bytes to wait for + * the rest of the message. Also, set "early + * eaten" since we've already buffered the skb + * but don't consume yet per tcp_read_sock. + */ + + if (!rxm->accum_len) { + /* Start RX timer for new message */ + strp_start_rx_timer(strp); + } + + strp->rx_need_bytes = rxm->strp.full_len - + rxm->accum_len; + rxm->accum_len += cand_len; + rxm->early_eaten = cand_len; + STRP_STATS_ADD(strp->stats.rx_bytes, cand_len); + desc->count = 0; /* Stop reading socket */ + break; + } + rxm->accum_len += cand_len; + eaten += cand_len; + WARN_ON(eaten != orig_len); + break; + } + + /* Positive extra indicates ore bytes than needed for the + * message + */ + + WARN_ON(extra > cand_len); + + eaten += (cand_len - extra); + + /* Hurray, we have a new message! */ + del_timer(&strp->rx_msg_timer); + strp->rx_skb_head = NULL; + STRP_STATS_INCR(strp->stats.rx_msgs); + + /* Give skb to upper layer */ + strp->cb.rcv_msg(strp, head); + + if (unlikely(strp->rx_paused)) { + /* Upper layer paused strp */ + break; + } + } + + if (cloned_orig) + kfree_skb(orig_skb); + + STRP_STATS_ADD(strp->stats.rx_bytes, eaten); + + return eaten; +} + +static int default_read_sock_done(struct strparser *strp, int err) +{ + return err; +} + +/* Called with lock held on lower socket */ +static int strp_tcp_read_sock(struct strparser *strp) +{ + read_descriptor_t desc; + + desc.arg.data = strp; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + /* sk should be locked here, so okay to do tcp_read_sock */ + tcp_read_sock(strp->sk, &desc, strp_tcp_recv); + + desc.error = strp->cb.read_sock_done(strp, desc.error); + + return desc.error; +} + +/* Lower sock lock held */ +void strp_tcp_data_ready(struct strparser *strp) +{ + struct sock *csk = strp->sk; + + if (unlikely(strp->rx_stopped)) + return; + + /* This check is needed to synchronize with do_strp_rx_work. + * do_strp_rx_work acquires a process lock (lock_sock) whereas + * the lock held here is bh_lock_sock. The two locks can be + * held by different threads at the same time, but bh_lock_sock + * allows a thread in BH context to safely check if the process + * lock is held. In this case, if the lock is held, queue work. + */ + if (sock_owned_by_user(csk)) { + queue_work(strp_wq, &strp->rx_work); + return; + } + + if (strp->rx_paused) + return; + + if (strp->rx_need_bytes) { + if (tcp_inq(csk) >= strp->rx_need_bytes) + strp->rx_need_bytes = 0; + else + return; + } + + if (strp_tcp_read_sock(strp) == -ENOMEM) + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_tcp_data_ready); + +static void do_strp_rx_work(struct strparser *strp) +{ + read_descriptor_t rd_desc; + struct sock *csk = strp->sk; + + /* We need the read lock to synchronize with strp_tcp_data_ready. We + * need the socket lock for calling tcp_read_sock. + */ + lock_sock(csk); + + if (unlikely(csk->sk_user_data != strp)) + goto out; + + if (unlikely(strp->rx_stopped)) + goto out; + + if (strp->rx_paused) + goto out; + + rd_desc.arg.data = strp; + + if (strp_tcp_read_sock(strp) == -ENOMEM) + queue_work(strp_wq, &strp->rx_work); + +out: + release_sock(csk); +} + +static void strp_rx_work(struct work_struct *w) +{ + do_strp_rx_work(container_of(w, struct strparser, rx_work)); +} + +static void strp_rx_msg_timeout(unsigned long arg) +{ + struct strparser *strp = (struct strparser *)arg; + + /* Message assembly timed out */ + STRP_STATS_INCR(strp->stats.rx_msg_timeouts); + lock_sock(strp->sk); + strp->cb.abort_parser(strp, ETIMEDOUT); + release_sock(strp->sk); +} + +int strp_init(struct strparser *strp, struct sock *csk, + struct strp_callbacks *cb) +{ + if (!cb || !cb->rcv_msg || !cb->parse_msg) + return -EINVAL; + + memset(strp, 0, sizeof(*strp)); + + strp->sk = csk; + + setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout, + (unsigned long)strp); + + INIT_WORK(&strp->rx_work, strp_rx_work); + + strp->cb.rcv_msg = cb->rcv_msg; + strp->cb.parse_msg = cb->parse_msg; + strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; + strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp; + + return 0; +} +EXPORT_SYMBOL_GPL(strp_init); + +/* strp must already be stopped so that strp_tcp_recv will no longer be called. + * Note that strp_done is not called with the lower socket held. + */ +void strp_done(struct strparser *strp) +{ + WARN_ON(!strp->rx_stopped); + + del_timer_sync(&strp->rx_msg_timer); + cancel_work_sync(&strp->rx_work); + + if (strp->rx_skb_head) { + kfree_skb(strp->rx_skb_head); + strp->rx_skb_head = NULL; + } +} +EXPORT_SYMBOL_GPL(strp_done); + +void strp_stop(struct strparser *strp) +{ + strp->rx_stopped = 1; +} +EXPORT_SYMBOL_GPL(strp_stop); + +void strp_check_rcv(struct strparser *strp) +{ + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_check_rcv); + +static int __init strp_mod_init(void) +{ + strp_wq = create_singlethread_workqueue("kstrp"); + + return 0; +} + +static void __exit strp_mod_exit(void) +{ +} +module_init(strp_mod_init); +module_exit(strp_mod_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 9b73896a81dc68a638a011877b7344b252f92276 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 15 Aug 2016 14:51:02 -0700 Subject: kcm: Use stream parser Adapt KCM to use the stream parser. This mostly involves removing the RX handling and setting up the strparser using the interface. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/kcm.h | 37 +--- net/ipv6/ila/ila_common.c | 1 - net/kcm/Kconfig | 1 + net/kcm/kcmproc.c | 44 +++-- net/kcm/kcmsock.c | 456 ++++++++-------------------------------------- 5 files changed, 116 insertions(+), 423 deletions(-) (limited to 'include') diff --git a/include/net/kcm.h b/include/net/kcm.h index 2840b5825dcc..2a8965819db0 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -13,6 +13,7 @@ #include #include +#include #include extern unsigned int kcm_net_id; @@ -21,16 +22,8 @@ extern unsigned int kcm_net_id; #define KCM_STATS_INCR(stat) ((stat)++) struct kcm_psock_stats { - unsigned long long rx_msgs; - unsigned long long rx_bytes; unsigned long long tx_msgs; unsigned long long tx_bytes; - unsigned int rx_aborts; - unsigned int rx_mem_fail; - unsigned int rx_need_more_hdr; - unsigned int rx_msg_too_big; - unsigned int rx_msg_timeouts; - unsigned int rx_bad_hdr_len; unsigned long long reserved; unsigned long long unreserved; unsigned int tx_aborts; @@ -64,13 +57,6 @@ struct kcm_tx_msg { struct sk_buff *last_skb; }; -struct kcm_rx_msg { - int full_len; - int accum_len; - int offset; - int early_eaten; -}; - /* Socket structure for KCM client sockets */ struct kcm_sock { struct sock sk; @@ -87,6 +73,7 @@ struct kcm_sock { struct work_struct tx_work; struct list_head wait_psock_list; struct sk_buff *seq_skb; + u32 tx_stopped : 1; /* Don't use bit fields here, these are set under different locks */ bool tx_wait; @@ -104,11 +91,11 @@ struct bpf_prog; /* Structure for an attached lower socket */ struct kcm_psock { struct sock *sk; + struct strparser strp; struct kcm_mux *mux; int index; u32 tx_stopped : 1; - u32 rx_stopped : 1; u32 done : 1; u32 unattaching : 1; @@ -121,18 +108,12 @@ struct kcm_psock { struct kcm_psock_stats stats; /* Receive */ - struct sk_buff *rx_skb_head; - struct sk_buff **rx_skb_nextp; - struct sk_buff *ready_rx_msg; struct list_head psock_ready_list; - struct work_struct rx_work; - struct delayed_work rx_delayed_work; struct bpf_prog *bpf_prog; struct kcm_sock *rx_kcm; unsigned long long saved_rx_bytes; unsigned long long saved_rx_msgs; - struct timer_list rx_msg_timer; - unsigned int rx_need_bytes; + struct sk_buff *ready_rx_msg; /* Transmit */ struct kcm_sock *tx_kcm; @@ -146,6 +127,7 @@ struct kcm_net { struct mutex mutex; struct kcm_psock_stats aggregate_psock_stats; struct kcm_mux_stats aggregate_mux_stats; + struct strp_aggr_stats aggregate_strp_stats; struct list_head mux_list; int count; }; @@ -163,6 +145,7 @@ struct kcm_mux { struct kcm_mux_stats stats; struct kcm_psock_stats aggregate_psock_stats; + struct strp_aggr_stats aggregate_strp_stats; /* Receive */ spinlock_t rx_lock ____cacheline_aligned_in_smp; @@ -190,14 +173,6 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, /* Save psock statistics in the mux when psock is being unattached. */ #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) - SAVE_PSOCK_STATS(rx_msgs); - SAVE_PSOCK_STATS(rx_bytes); - SAVE_PSOCK_STATS(rx_aborts); - SAVE_PSOCK_STATS(rx_mem_fail); - SAVE_PSOCK_STATS(rx_need_more_hdr); - SAVE_PSOCK_STATS(rx_msg_too_big); - SAVE_PSOCK_STATS(rx_msg_timeouts); - SAVE_PSOCK_STATS(rx_bad_hdr_len); SAVE_PSOCK_STATS(tx_msgs); SAVE_PSOCK_STATS(tx_bytes); SAVE_PSOCK_STATS(reserved); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index ec9efbcdad35..aba0998ddbfb 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -172,6 +172,5 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); -MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert "); MODULE_LICENSE("GPL"); diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 5db94d940ecc..87fca36e6c47 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -3,6 +3,7 @@ config AF_KCM tristate "KCM sockets" depends on INET select BPF_SYSCALL + select STREAM_PARSER ---help--- KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 16c2e03bd388..47e445364f4f 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, - psock->stats.rx_msgs, - psock->stats.rx_bytes, + psock->strp.stats.rx_msgs, + psock->strp.stats.rx_bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, @@ -170,9 +170,12 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, if (psock->tx_stopped) seq_puts(seq, "TxStop "); - if (psock->rx_stopped) + if (psock->strp.rx_stopped) seq_puts(seq, "RxStop "); + if (psock->strp.rx_paused) + seq_puts(seq, "RxPause "); + if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); @@ -275,6 +278,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; + struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; @@ -282,20 +286,28 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); + memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); + aggregate_strp_stats(&knet->aggregate_strp_stats, + &strp_stats); list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { spin_lock_bh(&mux->lock); aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); - list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_strp_stats(&mux->aggregate_strp_stats, + &strp_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); + save_strp_stats(&psock->strp, &strp_stats); + } + spin_unlock_bh(&mux->lock); } @@ -328,7 +340,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) mux_stats.rx_ready_drops); seq_printf(seq, - "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", @@ -337,6 +349,8 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "Reserved", "Unreserved", "RX-Aborts", + "RX-Intr", + "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", @@ -345,20 +359,22 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "TX-Aborts"); seq_printf(seq, - "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", - psock_stats.rx_msgs, - psock_stats.rx_bytes, + strp_stats.rx_msgs, + strp_stats.rx_bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, - psock_stats.rx_aborts, - psock_stats.rx_mem_fail, - psock_stats.rx_need_more_hdr, - psock_stats.rx_bad_hdr_len, - psock_stats.rx_msg_too_big, - psock_stats.rx_msg_timeouts, + strp_stats.rx_aborts, + strp_stats.rx_interrupted, + strp_stats.rx_unrecov_intr, + strp_stats.rx_mem_fail, + strp_stats.rx_need_more_hdr, + strp_stats.rx_bad_hdr_len, + strp_stats.rx_msg_too_big, + strp_stats.rx_msg_timeouts, psock_stats.tx_aborts); return 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index cb39e05b166c..eedbe404af35 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1,3 +1,13 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + #include #include #include @@ -35,38 +45,12 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) return (struct kcm_tx_msg *)skb->cb; } -static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) -{ - return (struct kcm_rx_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); -} - static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; csk->sk_error_report(csk); } -/* Callback lock held */ -static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, - struct sk_buff *skb) -{ - struct sock *csk = psock->sk; - - /* Unrecoverable error in receive */ - - del_timer(&psock->rx_msg_timer); - - if (psock->rx_stopped) - return; - - psock->rx_stopped = 1; - KCM_STATS_INCR(psock->stats.rx_aborts); - - /* Report an error on the lower socket */ - report_csk_error(csk, err); -} - static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { @@ -109,12 +93,13 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { - KCM_STATS_ADD(mux->stats.rx_bytes, - psock->stats.rx_bytes - psock->saved_rx_bytes); + STRP_STATS_ADD(mux->stats.rx_bytes, + psock->strp.stats.rx_bytes - + psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->stats.rx_msgs; - psock->saved_rx_bytes = psock->stats.rx_bytes; + psock->strp.stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.rx_msgs; + psock->saved_rx_bytes = psock->strp.stats.rx_bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -167,11 +152,11 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; - /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); - queue_work(kcm_wq, &psock->rx_work); + strp_unpause(&psock->strp); + strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ @@ -285,6 +270,7 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; + strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); @@ -353,276 +339,6 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_unlock_bh(&mux->rx_lock); } -static void kcm_start_rx_timer(struct kcm_psock *psock) -{ - if (psock->sk->sk_rcvtimeo) - mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); -} - -/* Macro to invoke filter function. */ -#define KCM_RUN_FILTER(prog, ctx) \ - (*prog->bpf_func)(ctx, prog->insnsi) - -/* Lower socket lock held */ -static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) -{ - struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; - struct kcm_rx_msg *rxm; - struct kcm_sock *kcm; - struct sk_buff *head, *skb; - size_t eaten = 0, cand_len; - ssize_t extra; - int err; - bool cloned_orig = false; - - if (psock->ready_rx_msg) - return 0; - - head = psock->rx_skb_head; - if (head) { - /* Message already in progress */ - - rxm = kcm_rx_msg(head); - if (unlikely(rxm->early_eaten)) { - /* Already some number of bytes on the receive sock - * data saved in rx_skb_head, just indicate they - * are consumed. - */ - eaten = orig_len <= rxm->early_eaten ? - orig_len : rxm->early_eaten; - rxm->early_eaten -= eaten; - - return eaten; - } - - if (unlikely(orig_offset)) { - /* Getting data with a non-zero offset when a message is - * in progress is not expected. If it does happen, we - * need to clone and pull since we can't deal with - * offsets in the skbs for a message expect in the head. - */ - orig_skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!orig_skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - if (!pskb_pull(orig_skb, orig_offset)) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - kfree_skb(orig_skb); - desc->error = -ENOMEM; - return 0; - } - cloned_orig = true; - orig_offset = 0; - } - - if (!psock->rx_skb_nextp) { - /* We are going to append to the frags_list of head. - * Need to unshare the frag_list. - */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - return 0; - } - - if (unlikely(skb_shinfo(head)->frag_list)) { - /* We can't append to an sk_buff that already - * has a frag_list. We create a new head, point - * the frag_list of that to the old head, and - * then are able to use the old head->next for - * appending to the message. - */ - if (WARN_ON(head->next)) { - desc->error = -EINVAL; - return 0; - } - - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *kcm_rx_msg(skb) = *kcm_rx_msg(head); - psock->rx_skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; - psock->rx_skb_head = skb; - head = skb; - } else { - psock->rx_skb_nextp = - &skb_shinfo(head)->frag_list; - } - } - } - - while (eaten < orig_len) { - /* Always clone since we will consume something */ - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - break; - } - - cand_len = orig_len - eaten; - - head = psock->rx_skb_head; - if (!head) { - head = skb; - psock->rx_skb_head = head; - /* Will set rx_skb_nextp on next packet if needed */ - psock->rx_skb_nextp = NULL; - rxm = kcm_rx_msg(head); - memset(rxm, 0, sizeof(*rxm)); - rxm->offset = orig_offset + eaten; - } else { - /* Unclone since we may be appending to an skb that we - * already share a frag_list with. - */ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - break; - } - - rxm = kcm_rx_msg(head); - *psock->rx_skb_nextp = skb; - psock->rx_skb_nextp = &skb->next; - head->data_len += skb->len; - head->len += skb->len; - head->truesize += skb->truesize; - } - - if (!rxm->full_len) { - ssize_t len; - - len = KCM_RUN_FILTER(psock->bpf_prog, head); - - if (!len) { - /* Need more header to determine length */ - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - rxm->accum_len += cand_len; - eaten += cand_len; - KCM_STATS_INCR(psock->stats.rx_need_more_hdr); - WARN_ON(eaten != orig_len); - break; - } else if (len > psock->sk->sk_rcvbuf) { - /* Message length exceeds maximum allowed */ - KCM_STATS_INCR(psock->stats.rx_msg_too_big); - desc->error = -EMSGSIZE; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EMSGSIZE, head); - break; - } else if (len <= (ssize_t)head->len - - skb->len - rxm->offset) { - /* Length must be into new skb (and also - * greater than zero) - */ - KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); - desc->error = -EPROTO; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EPROTO, head); - break; - } - - rxm->full_len = len; - } - - extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; - - if (extra < 0) { - /* Message not complete yet. */ - if (rxm->full_len - rxm->accum_len > - tcp_inq(psock->sk)) { - /* Don't have the whole messages in the socket - * buffer. Set psock->rx_need_bytes to wait for - * the rest of the message. Also, set "early - * eaten" since we've already buffered the skb - * but don't consume yet per tcp_read_sock. - */ - - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - - psock->rx_need_bytes = rxm->full_len - - rxm->accum_len; - rxm->accum_len += cand_len; - rxm->early_eaten = cand_len; - KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); - desc->count = 0; /* Stop reading socket */ - break; - } - rxm->accum_len += cand_len; - eaten += cand_len; - WARN_ON(eaten != orig_len); - break; - } - - /* Positive extra indicates ore bytes than needed for the - * message - */ - - WARN_ON(extra > cand_len); - - eaten += (cand_len - extra); - - /* Hurray, we have a new message! */ - del_timer(&psock->rx_msg_timer); - psock->rx_skb_head = NULL; - KCM_STATS_INCR(psock->stats.rx_msgs); - -try_queue: - kcm = reserve_rx_kcm(psock, head); - if (!kcm) { - /* Unable to reserve a KCM, message is held in psock. */ - break; - } - - if (kcm_queue_rcv_skb(&kcm->sk, head)) { - /* Should mean socket buffer full */ - unreserve_rx_kcm(psock, false); - goto try_queue; - } - } - - if (cloned_orig) - kfree_skb(orig_skb); - - KCM_STATS_ADD(psock->stats.rx_bytes, eaten); - - return eaten; -} - -/* Called with lock held on lower socket */ -static int psock_tcp_read_sock(struct kcm_psock *psock) -{ - read_descriptor_t desc; - - desc.arg.data = psock; - desc.error = 0; - desc.count = 1; /* give more than one skb per call */ - - /* sk should be locked here, so okay to do tcp_read_sock */ - tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); - - unreserve_rx_kcm(psock, true); - - return desc.error; -} - /* Lower sock lock held */ static void psock_tcp_data_ready(struct sock *sk) { @@ -631,65 +347,49 @@ static void psock_tcp_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; - if (unlikely(!psock || psock->rx_stopped)) - goto out; - - if (psock->ready_rx_msg) - goto out; - - if (psock->rx_need_bytes) { - if (tcp_inq(sk) >= psock->rx_need_bytes) - psock->rx_need_bytes = 0; - else - goto out; - } - - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + if (likely(psock)) + strp_tcp_data_ready(&psock->strp); -out: read_unlock_bh(&sk->sk_callback_lock); } -static void do_psock_rx_work(struct kcm_psock *psock) +/* Called with lower sock held */ +static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { - read_descriptor_t rd_desc; - struct sock *csk = psock->sk; - - /* We need the read lock to synchronize with psock_tcp_data_ready. We - * need the socket lock for calling tcp_read_sock. - */ - lock_sock(csk); - read_lock_bh(&csk->sk_callback_lock); - - if (unlikely(csk->sk_user_data != psock)) - goto out; - - if (unlikely(psock->rx_stopped)) - goto out; - - if (psock->ready_rx_msg) - goto out; - - rd_desc.arg.data = psock; + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct kcm_sock *kcm; - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); +try_queue: + kcm = reserve_rx_kcm(psock, skb); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock and strp + * is paused. + */ + return; + } -out: - read_unlock_bh(&csk->sk_callback_lock); - release_sock(csk); + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } } -static void psock_rx_work(struct work_struct *w) +static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { - do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct bpf_prog *prog = psock->bpf_prog; + + return (*prog->bpf_func)(skb, prog->insnsi); } -static void psock_rx_delayed_work(struct work_struct *w) +static int kcm_read_sock_done(struct strparser *strp, int err) { - do_psock_rx_work(container_of(w, struct kcm_psock, - rx_delayed_work.work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + + unreserve_rx_kcm(psock, true); + + return err; } static void psock_tcp_state_change(struct sock *sk) @@ -713,14 +413,13 @@ static void psock_tcp_write_space(struct sock *sk) psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; - mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm) + if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -1411,7 +1110,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct kcm_sock *kcm = kcm_sk(sk); int err = 0; long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int copied = 0; struct sk_buff *skb; @@ -1425,7 +1124,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1481,7 +1180,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int err = 0; ssize_t copied; struct sk_buff *skb; @@ -1498,7 +1197,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1674,15 +1373,6 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->rx_lock); } -static void kcm_rx_msg_timeout(unsigned long arg) -{ - struct kcm_psock *psock = (struct kcm_psock *)arg; - - /* Message assembly timed out */ - KCM_STATS_INCR(psock->stats.rx_msg_timeouts); - kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); -} - static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { @@ -1692,6 +1382,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; + struct strp_callbacks cb; if (csock->ops->family != PF_INET && csock->ops->family != PF_INET6) @@ -1713,11 +1404,12 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, - (unsigned long)psock); + cb.rcv_msg = kcm_rcv_strparser; + cb.abort_parser = NULL; + cb.parse_msg = kcm_parse_func_strparser; + cb.read_sock_done = kcm_read_sock_done; - INIT_WORK(&psock->rx_work, psock_rx_work); - INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + strp_init(&psock->strp, csk, &cb); sock_hold(csk); @@ -1750,7 +1442,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ - queue_work(kcm_wq, &psock->rx_work); + strp_check_rcv(&psock->strp); return 0; } @@ -1785,6 +1477,7 @@ out: return err; } +/* Lower socket lock held */ static void kcm_unattach(struct kcm_psock *psock) { struct sock *csk = psock->sk; @@ -1798,7 +1491,7 @@ static void kcm_unattach(struct kcm_psock *psock) csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; - psock->rx_stopped = 1; + strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); @@ -1821,18 +1514,14 @@ static void kcm_unattach(struct kcm_psock *psock) write_unlock_bh(&csk->sk_callback_lock); - del_timer_sync(&psock->rx_msg_timer); - cancel_work_sync(&psock->rx_work); - cancel_delayed_work_sync(&psock->rx_delayed_work); + strp_done(&psock->strp); bpf_prog_put(psock->bpf_prog); - kfree_skb(psock->rx_skb_head); - psock->rx_skb_head = NULL; - spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); @@ -1915,6 +1604,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) spin_unlock_bh(&mux->lock); + /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; @@ -2059,8 +1749,11 @@ static void release_mux(struct kcm_mux *mux) /* Release psocks */ list_for_each_entry_safe(psock, tmp_psock, &mux->psocks, psock_list) { - if (!WARN_ON(psock->unattaching)) + if (!WARN_ON(psock->unattaching)) { + lock_sock(psock->strp.sk); kcm_unattach(psock); + release_sock(psock->strp.sk); + } } if (WARN_ON(mux->psocks_cnt)) @@ -2072,6 +1765,8 @@ static void release_mux(struct kcm_mux *mux) aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); + aggregate_strp_stats(&mux->aggregate_strp_stats, + &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); @@ -2151,6 +1846,13 @@ static int kcm_release(struct socket *sock) * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); + + /* Set tx_stopped. This is checked when psock is bound to a kcm and we + * get a writespace callback. This prevents further work being queued + * from the callback (unbinding the psock occurs after canceling work. + */ + kcm->tx_stopped = 1; + release_sock(sk); spin_lock_bh(&mux->lock); -- cgit v1.2.3 From 83b502a12e82d0ae97907d415496fbafe044f0ce Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 4 Aug 2016 17:32:02 +0300 Subject: net/mlx5: Modify RQ bitmask from mlx5 ifc Use mlx5 ifc MODIFY_BITMASK_VSD in mlx5e_modify_rq_vsd and expose counter set capability bit in hca caps structure. Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- include/linux/mlx5/driver.h | 4 ---- include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 10fa12aa063f..9e36c1538904 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -489,7 +489,8 @@ static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); - MLX5_SET64(modify_rq_in, in, modify_bitmask, MLX5_RQ_BITMASK_VSD); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); MLX5_SET(rqc, rqc, vsd, vsd); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ebe57abf3324..0ea78b5edbb2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -48,10 +48,6 @@ #include #include -enum { - MLX5_RQ_BITMASK_VSD = 1 << 1, -}; - enum { MLX5_BOARD_ID_LEN = 64, MLX5_MAX_NAME_LEN = 16, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3766110e13ea..e1f8e3491867 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -779,7 +779,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 out_of_seq_cnt[0x1]; u8 vport_counters[0x1]; u8 retransmission_q_counters[0x1]; - u8 reserved_at_183[0x3]; + u8 reserved_at_183[0x1]; + u8 modify_rq_counter_set_id[0x1]; + u8 reserved_at_185[0x1]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -4750,6 +4752,11 @@ struct mlx5_ifc_modify_rq_out_bits { u8 reserved_at_40[0x40]; }; +enum { + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID = 1ULL << 3, +}; + struct mlx5_ifc_modify_rq_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; -- cgit v1.2.3 From 2e353b3468ecb1d12a44aaf35888f7de47d5c047 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Sun, 3 Jul 2016 14:57:33 +0300 Subject: net/mlx5: Update struct mlx5_ifc_xrqc_bits Update struct mlx5_ifc_xrqc_bits according to last specification Signed-off-by: Artemy Kovalyov Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e1f8e3491867..5f150c849a8f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2829,7 +2829,7 @@ struct mlx5_ifc_xrqc_bits { struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; - u8 reserved_at_180[0x180]; + u8 reserved_at_180[0x200]; struct mlx5_ifc_wq_bits wq; }; -- cgit v1.2.3 From 8cca30a7f914fe363fa9700715619ca5c8cb38cc Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sun, 26 Jun 2016 12:43:24 +0300 Subject: net/mlx5: Expose mlx5e_link_mode The mlx5e_link_mode enumeration will also be used in mlx5_ib for RoCE. This patch moves the enumeration to the mlx5 driver port header file. Signed-off-by: Noa Osherovich Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 34 ---------------------------- include/linux/mlx5/port.h | 33 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 1b495efa7490..61902b147339 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -651,40 +651,6 @@ struct mlx5e_priv { void *ppriv; }; -enum mlx5e_link_mode { - MLX5E_1000BASE_CX_SGMII = 0, - MLX5E_1000BASE_KX = 1, - MLX5E_10GBASE_CX4 = 2, - MLX5E_10GBASE_KX4 = 3, - MLX5E_10GBASE_KR = 4, - MLX5E_20GBASE_KR2 = 5, - MLX5E_40GBASE_CR4 = 6, - MLX5E_40GBASE_KR4 = 7, - MLX5E_56GBASE_R4 = 8, - MLX5E_10GBASE_CR = 12, - MLX5E_10GBASE_SR = 13, - MLX5E_10GBASE_ER = 14, - MLX5E_40GBASE_SR4 = 15, - MLX5E_40GBASE_LR4 = 16, - MLX5E_50GBASE_SR2 = 18, - MLX5E_100GBASE_CR4 = 20, - MLX5E_100GBASE_SR4 = 21, - MLX5E_100GBASE_KR4 = 22, - MLX5E_100GBASE_LR4 = 23, - MLX5E_100BASE_TX = 24, - MLX5E_1000BASE_T = 25, - MLX5E_10GBASE_T = 26, - MLX5E_25GBASE_CR = 27, - MLX5E_25GBASE_KR = 28, - MLX5E_25GBASE_SR = 29, - MLX5E_50GBASE_CR2 = 30, - MLX5E_50GBASE_KR2 = 31, - MLX5E_LINK_MODES_NUMBER, -}; - -#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) - - void mlx5e_build_ptys2ethtool_map(void); void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw); diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e3012cc64b8a..6f876a4770f6 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -61,6 +61,39 @@ enum mlx5_an_status { #define MLX5_I2C_ADDR_HIGH 0x51 #define MLX5_EEPROM_PAGE_LENGTH 256 +enum mlx5e_link_mode { + MLX5E_1000BASE_CX_SGMII = 0, + MLX5E_1000BASE_KX = 1, + MLX5E_10GBASE_CX4 = 2, + MLX5E_10GBASE_KX4 = 3, + MLX5E_10GBASE_KR = 4, + MLX5E_20GBASE_KR2 = 5, + MLX5E_40GBASE_CR4 = 6, + MLX5E_40GBASE_KR4 = 7, + MLX5E_56GBASE_R4 = 8, + MLX5E_10GBASE_CR = 12, + MLX5E_10GBASE_SR = 13, + MLX5E_10GBASE_ER = 14, + MLX5E_40GBASE_SR4 = 15, + MLX5E_40GBASE_LR4 = 16, + MLX5E_50GBASE_SR2 = 18, + MLX5E_100GBASE_CR4 = 20, + MLX5E_100GBASE_SR4 = 21, + MLX5E_100GBASE_KR4 = 22, + MLX5E_100GBASE_LR4 = 23, + MLX5E_100BASE_TX = 24, + MLX5E_1000BASE_T = 25, + MLX5E_10GBASE_T = 26, + MLX5E_25GBASE_CR = 27, + MLX5E_25GBASE_KR = 28, + MLX5E_25GBASE_SR = 29, + MLX5E_50GBASE_CR2 = 30, + MLX5E_50GBASE_KR2 = 31, + MLX5E_LINK_MODES_NUMBER, +}; + +#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) + int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, int ptys_size, int proto_mask, u8 local_port); -- cgit v1.2.3 From d5beb7f2aff4a60237fd97a98d49a78c9045b8f2 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Thu, 2 Jun 2016 10:47:53 +0300 Subject: net/mlx5: Separate query_port_proto_oper for IB and EN Replaced mlx5_query_port_proto_oper with separate functions per link type. The functions should take different arguments so no point in trying to unite them. Signed-off-by: Noa Osherovich Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/port.c | 32 ++++++++++++++++++-------- include/linux/mlx5/port.h | 7 +++--- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6fb77d791045..f02a975320bd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -748,8 +748,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, &props->active_width); if (err) goto out; - err = mlx5_query_port_proto_oper(mdev, &props->active_speed, MLX5_PTYS_IB, - port); + err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port); if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 2ee28ad8f918..34e7184e23c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -175,25 +175,39 @@ int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_port_link_width_oper); -int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, - u8 *proto_oper, int proto_mask, - u8 local_port) +int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev, + u32 *proto_oper, u8 local_port) { u32 out[MLX5_ST_SZ_DW(ptys_reg)]; int err; - err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, local_port); + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, + local_port); if (err) return err; - if (proto_mask == MLX5_PTYS_EN) - *proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper); - else - *proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); + *proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper); + + return 0; +} +EXPORT_SYMBOL(mlx5_query_port_eth_proto_oper); + +int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, + local_port); + if (err) + return err; + + *proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); return 0; } -EXPORT_SYMBOL_GPL(mlx5_query_port_proto_oper); +EXPORT_SYMBOL(mlx5_query_port_ib_proto_oper); int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable, u32 proto_admin, int proto_mask) diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 6f876a4770f6..b3065acd20b4 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -103,9 +103,10 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, u32 *proto_admin, int proto_mask); int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, u8 *link_width_oper, u8 local_port); -int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, - u8 *proto_oper, int proto_mask, - u8 local_port); +int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, u8 local_port); +int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev, + u32 *proto_oper, u8 local_port); int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable, u32 proto_admin, int proto_mask); void mlx5_toggle_port_link(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 84df61ebc69bdc466180e02d654e9b0284781288 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Tue, 10 May 2016 13:47:50 +0300 Subject: net/mlx5: Add HW interfaces used by LAG Exposed LAG commands enum and layouts: - CREATE_LAG HW enters LAG mode: RoCE traffic from port two is received on PF0 core dev. Allows to set tx_affinity (tx port) for QPs and TISes. Allows to port remap QPs and TISes, overriding their tx_affinity behavior. - MODIFY_LAG Remap QPs and TISes to another port. - QUERY_LAG Query whether LAG mode is active. - DESTROY_LAG HW exits LAG mode, returning to non-LAG behavior. - CREATE_VPORT_LAG Merge Ethernet flow steering, such that traffic received on port two jumps to PF0 root flow table. Available only in LAG mode. - DESTROY_VPORT_LAG Ethernet flow steering returns to non-LAG behavior. Caps added: - lag_master Driver is in charge of managing the LAG. This is currently the only option. - num_lag_ports LAG is supported only if this field's value is 2. Other fields: - QP/TIS tx port affinity During LAG, this field controls on which port a QP or TIS resides. - TIS strict tx affinity When this field is set, the TIS will not be subject to port remap by CREATE_LAG/MODIFY_LAG. - LAG demux flow table Flow table used for redirecting non user-space traffic back to PF1 root flow table, if the packet was received on port two. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 12 ++ include/linux/mlx5/mlx5_ifc.h | 166 ++++++++++++++++++++++++-- 2 files changed, 171 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 00bec609aae2..6388bc0d9b97 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -285,6 +285,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN: case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT: case MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY: + case MLX5_CMD_OP_DESTROY_LAG: + case MLX5_CMD_OP_DESTROY_VPORT_LAG: case MLX5_CMD_OP_DESTROY_TIR: case MLX5_CMD_OP_DESTROY_SQ: case MLX5_CMD_OP_DESTROY_RQ: @@ -376,6 +378,10 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_LAG: + case MLX5_CMD_OP_MODIFY_LAG: + case MLX5_CMD_OP_QUERY_LAG: + case MLX5_CMD_OP_CREATE_VPORT_LAG: case MLX5_CMD_OP_CREATE_TIR: case MLX5_CMD_OP_MODIFY_TIR: case MLX5_CMD_OP_QUERY_TIR: @@ -514,6 +520,12 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(DELETE_L2_TABLE_ENTRY); MLX5_COMMAND_STR_CASE(SET_WOL_ROL); MLX5_COMMAND_STR_CASE(QUERY_WOL_ROL); + MLX5_COMMAND_STR_CASE(CREATE_LAG); + MLX5_COMMAND_STR_CASE(MODIFY_LAG); + MLX5_COMMAND_STR_CASE(QUERY_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_LAG); + MLX5_COMMAND_STR_CASE(CREATE_VPORT_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_VPORT_LAG); MLX5_COMMAND_STR_CASE(CREATE_TIR); MLX5_COMMAND_STR_CASE(MODIFY_TIR); MLX5_COMMAND_STR_CASE(DESTROY_TIR); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5f150c849a8f..043d5256b754 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -174,6 +174,12 @@ enum { MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, MLX5_CMD_OP_SET_WOL_ROL = 0x830, MLX5_CMD_OP_QUERY_WOL_ROL = 0x831, + MLX5_CMD_OP_CREATE_LAG = 0x840, + MLX5_CMD_OP_MODIFY_LAG = 0x841, + MLX5_CMD_OP_QUERY_LAG = 0x842, + MLX5_CMD_OP_DESTROY_LAG = 0x843, + MLX5_CMD_OP_CREATE_VPORT_LAG = 0x844, + MLX5_CMD_OP_DESTROY_VPORT_LAG = 0x845, MLX5_CMD_OP_CREATE_TIR = 0x900, MLX5_CMD_OP_MODIFY_TIR = 0x901, MLX5_CMD_OP_DESTROY_TIR = 0x902, @@ -884,7 +890,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pad_tx_eth_packet[0x1]; u8 reserved_at_263[0x8]; u8 log_bf_reg_size[0x5]; - u8 reserved_at_270[0x10]; + + u8 reserved_at_270[0xb]; + u8 lag_master[0x1]; + u8 num_lag_ports[0x4]; u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; @@ -1918,7 +1927,7 @@ enum { struct mlx5_ifc_qpc_bits { u8 state[0x4]; - u8 reserved_at_4[0x4]; + u8 lag_tx_port_affinity[0x4]; u8 st[0x8]; u8 reserved_at_10[0x3]; u8 pm_state[0x2]; @@ -2167,7 +2176,11 @@ struct mlx5_ifc_traffic_counter_bits { }; struct mlx5_ifc_tisc_bits { - u8 reserved_at_0[0xc]; + u8 strict_lag_tx_port_affinity[0x1]; + u8 reserved_at_1[0x3]; + u8 lag_tx_port_affinity[0x04]; + + u8 reserved_at_8[0x4]; u8 prio[0x4]; u8 reserved_at_10[0x10]; @@ -4617,7 +4630,9 @@ struct mlx5_ifc_modify_tis_out_bits { struct mlx5_ifc_modify_tis_bitmask_bits { u8 reserved_at_0[0x20]; - u8 reserved_at_20[0x1f]; + u8 reserved_at_20[0x1d]; + u8 lag_tx_port_affinity[0x1]; + u8 strict_lag_tx_port_affinity[0x1]; u8 prio[0x1]; }; @@ -6215,7 +6230,10 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_at_e0[0x8]; u8 table_miss_id[0x18]; - u8 reserved_at_100[0x100]; + u8 reserved_at_100[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_120[0x80]; }; struct mlx5_ifc_create_flow_group_out_bits { @@ -7669,7 +7687,8 @@ struct mlx5_ifc_set_flow_table_root_in_bits { }; enum { - MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = 0x1, + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = (1UL << 0), + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID = (1UL << 15), }; struct mlx5_ifc_modify_flow_table_out_bits { @@ -7708,7 +7727,10 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 reserved_at_e0[0x8]; u8 table_miss_id[0x18]; - u8 reserved_at_100[0x100]; + u8 reserved_at_100[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_120[0x80]; }; struct mlx5_ifc_ets_tcn_config_reg_bits { @@ -7816,4 +7838,134 @@ struct mlx5_ifc_dcbx_param_bits { u8 error[0x8]; u8 reserved_at_a0[0x160]; }; + +struct mlx5_ifc_lagc_bits { + u8 reserved_at_0[0x1d]; + u8 lag_state[0x3]; + + u8 reserved_at_20[0x14]; + u8 tx_remap_affinity_2[0x4]; + u8 reserved_at_38[0x4]; + u8 tx_remap_affinity_1[0x4]; +}; + +struct mlx5_ifc_create_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_modify_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + u8 field_select[0x20]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 7907f23adc186700efbe56c032527e47485c86ab Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Sun, 17 Apr 2016 16:57:32 +0300 Subject: net/mlx5: Implement RoCE LAG feature Available on dual port cards only, this feature keeps track, using netdev LAG events, of the bonding and link status of each port's PF netdev. When both of the card's PF netdevs are enslaved to the same bond/team master, and only them, LAG state is active. During LAG, only one IB device is present for both ports. In addition to the above, this commit includes FW commands used for managing the LAG, new facilities for adding and removing a single device by interface, and port remap functionality according to bond events. Please note that this feature is currently used only for mimicking Ethernet bonding for RoCE - netdevs functionality is not altered, and their bonding continues to be managed solely by bond/team driver. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 + drivers/net/ethernet/mellanox/mlx5/core/lag.c | 530 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 51 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 12 + include/linux/mlx5/driver.h | 4 + 6 files changed, 588 insertions(+), 14 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lag.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 05cc1effc13c..dad326ccd4dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ - fs_counters.o rl.o + fs_counters.o rl.o lag.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \ en_main.o en_common.o en_fs.o en_ethtool.o en_tx.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9e36c1538904..219804fa7c74 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3369,6 +3369,8 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) struct mlx5_eswitch *esw = mdev->priv.eswitch; struct mlx5_eswitch_rep rep; + mlx5_lag_add(mdev, netdev); + if (mlx5e_vxlan_allowed(mdev)) { rtnl_lock(); udp_tunnel_get_rx_info(netdev); @@ -3391,6 +3393,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) { queue_work(priv->wq, &priv->set_rx_mode_work); mlx5e_disable_async_events(priv); + mlx5_lag_remove(priv->mdev); } static const struct mlx5e_profile mlx5e_nic_profile = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c new file mode 100644 index 000000000000..3bf0a7ffe0e0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_LAG_FLAG_BONDED = 1 << 0, +}; + +struct lag_func { + struct mlx5_core_dev *dev; + struct net_device *netdev; +}; + +/* Used for collection of netdev event info. */ +struct lag_tracker { + enum netdev_lag_tx_type tx_type; + struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; + bool is_bonded; +}; + +/* LAG data of a ConnectX card. + * It serves both its phys functions. + */ +struct mlx5_lag { + u8 flags; + u8 v2p_map[MLX5_MAX_PORTS]; + struct lag_func pf[MLX5_MAX_PORTS]; + struct lag_tracker tracker; + struct delayed_work bond_work; + struct notifier_block nb; +}; + +/* General purpose, use for short periods of time. + * Beware of lock dependencies (preferably, no locks should be acquired + * under it). + */ +static DEFINE_MUTEX(lag_mutex); + +static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, + u8 remap_port2) +{ + u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_lag_out)] = {0}; + void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); + + MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); + + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, + u8 remap_port2) +{ + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_lag_out)] = {0}; + void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + + MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); + MLX5_SET(modify_lag_in, in, field_select, 0x1); + + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0}; + + MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev) +{ + return dev->priv.lag; +} + +static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev) +{ + int i; + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].netdev == ndev) + return i; + + return -1; +} + +static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_BONDED); +} + +static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, + u8 *port1, u8 *port2) +{ + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + if (tracker->netdev_state[0].tx_enabled) { + *port1 = 1; + *port2 = 1; + } else { + *port1 = 2; + *port2 = 2; + } + } else { + *port1 = 1; + *port2 = 2; + if (!tracker->netdev_state[0].link_up) + *port1 = 2; + else if (!tracker->netdev_state[1].link_up) + *port2 = 1; + } +} + +static void mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + int err; + + ldev->flags |= MLX5_LAG_FLAG_BONDED; + + mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0], + &ldev->v2p_map[1]); + + err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]); + if (err) + mlx5_core_err(dev0, + "Failed to create LAG (%d)\n", + err); +} + +static void mlx5_deactivate_lag(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + int err; + + ldev->flags &= ~MLX5_LAG_FLAG_BONDED; + + err = mlx5_cmd_destroy_lag(dev0); + if (err) + mlx5_core_err(dev0, + "Failed to destroy LAG (%d)\n", + err); +} + +static void mlx5_do_bond(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + struct mlx5_core_dev *dev1 = ldev->pf[1].dev; + struct lag_tracker tracker; + u8 v2p_port1, v2p_port2; + int i, err; + + if (!dev0 || !dev1) + return; + + mutex_lock(&lag_mutex); + tracker = ldev->tracker; + mutex_unlock(&lag_mutex); + + if (tracker.is_bonded && !mlx5_lag_is_bonded(ldev)) { + for (i = 0; i < MLX5_MAX_PORTS; i++) + mlx5_remove_dev_by_protocol(ldev->pf[i].dev, + MLX5_INTERFACE_PROTOCOL_IB); + + mlx5_activate_lag(ldev, &tracker); + + mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB); + mlx5_nic_vport_enable_roce(dev1); + } else if (tracker.is_bonded && mlx5_lag_is_bonded(ldev)) { + mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1, + &v2p_port2); + + if ((v2p_port1 != ldev->v2p_map[0]) || + (v2p_port2 != ldev->v2p_map[1])) { + ldev->v2p_map[0] = v2p_port1; + ldev->v2p_map[1] = v2p_port2; + + err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2); + if (err) + mlx5_core_err(dev0, + "Failed to modify LAG (%d)\n", + err); + } + } else if (!tracker.is_bonded && mlx5_lag_is_bonded(ldev)) { + mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB); + mlx5_nic_vport_disable_roce(dev1); + + mlx5_deactivate_lag(ldev); + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev) + mlx5_add_dev_by_protocol(ldev->pf[i].dev, + MLX5_INTERFACE_PROTOCOL_IB); + } +} + +static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) +{ + schedule_delayed_work(&ldev->bond_work, delay); +} + +static void mlx5_do_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag, + bond_work); + int status; + + status = mutex_trylock(&mlx5_intf_mutex); + if (!status) { + /* 1 sec delay. */ + mlx5_queue_bond_work(ldev, HZ); + return; + } + + mlx5_do_bond(ldev); + mutex_unlock(&mlx5_intf_mutex); +} + +static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *upper = info->upper_dev, *ndev_tmp; + struct netdev_lag_upper_info *lag_upper_info; + bool is_bonded; + int bond_status = 0; + int num_slaves = 0; + int idx; + + if (!netif_is_lag_master(upper)) + return 0; + + lag_upper_info = info->upper_info; + + /* The event may still be of interest if the slave does not belong to + * us, but is enslaved to a master which has one or more of our netdevs + * as slaves (e.g., if a new slave is added to a master that bonds two + * of our netdevs, we should unbond). + */ + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, ndev_tmp) { + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx > -1) + bond_status |= (1 << idx); + + num_slaves++; + } + rcu_read_unlock(); + + /* None of this lagdev's netdevs are slaves of this master. */ + if (!(bond_status & 0x3)) + return 0; + + if (lag_upper_info) + tracker->tx_type = lag_upper_info->tx_type; + + /* Determine bonding status: + * A device is considered bonded if both its physical ports are slaves + * of the same lag master, and only them. + * Lag mode must be activebackup or hash. + */ + is_bonded = (num_slaves == MLX5_MAX_PORTS) && + (bond_status == 0x3) && + ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) || + (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)); + + if (tracker->is_bonded != is_bonded) { + tracker->is_bonded = is_bonded; + return 1; + } + + return 0; +} + +static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changelowerstate_info *info) +{ + struct netdev_lag_lower_state_info *lag_lower_info; + int idx; + + if (!netif_is_lag_port(ndev)) + return 0; + + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev); + if (idx == -1) + return 0; + + /* This information is used to determine virtual to physical + * port mapping. + */ + lag_lower_info = info->lower_state_info; + if (!lag_lower_info) + return 0; + + tracker->netdev_state[idx] = *lag_lower_info; + + return 1; +} + +static int mlx5_lag_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct lag_tracker tracker; + struct mlx5_lag *ldev; + int changed = 0; + + if (!net_eq(dev_net(ndev), &init_net)) + return NOTIFY_DONE; + + if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE)) + return NOTIFY_DONE; + + ldev = container_of(this, struct mlx5_lag, nb); + tracker = ldev->tracker; + + switch (event) { + case NETDEV_CHANGEUPPER: + changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev, + ptr); + break; + case NETDEV_CHANGELOWERSTATE: + changed = mlx5_handle_changelowerstate_event(ldev, &tracker, + ndev, ptr); + break; + } + + mutex_lock(&lag_mutex); + ldev->tracker = tracker; + mutex_unlock(&lag_mutex); + + if (changed) + mlx5_queue_bond_work(ldev, 0); + + return NOTIFY_DONE; +} + +static struct mlx5_lag *mlx5_lag_dev_alloc(void) +{ + struct mlx5_lag *ldev; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return NULL; + + INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + + return ldev; +} + +static void mlx5_lag_dev_free(struct mlx5_lag *ldev) +{ + kfree(ldev); +} + +static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev, + struct net_device *netdev) +{ + unsigned int fn = PCI_FUNC(dev->pdev->devfn); + + if (fn >= MLX5_MAX_PORTS) + return; + + mutex_lock(&lag_mutex); + ldev->pf[fn].dev = dev; + ldev->pf[fn].netdev = netdev; + ldev->tracker.netdev_state[fn].link_up = 0; + ldev->tracker.netdev_state[fn].tx_enabled = 0; + + dev->priv.lag = ldev; + mutex_unlock(&lag_mutex); +} + +static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + int i; + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev == dev) + break; + + if (i == MLX5_MAX_PORTS) + return; + + mutex_lock(&lag_mutex); + memset(&ldev->pf[i], 0, sizeof(*ldev->pf)); + + dev->priv.lag = NULL; + mutex_unlock(&lag_mutex); +} + +static u16 mlx5_gen_pci_id(struct mlx5_core_dev *dev) +{ + return (u16)((dev->pdev->bus->number << 8) | + PCI_SLOT(dev->pdev->devfn)); +} + +/* Must be called with intf_mutex held */ +void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) +{ + struct mlx5_lag *ldev = NULL; + struct mlx5_core_dev *tmp_dev; + struct mlx5_priv *priv; + u16 pci_id; + + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS)) + return; + + pci_id = mlx5_gen_pci_id(dev); + + mlx5_core_for_each_priv(priv) { + tmp_dev = container_of(priv, struct mlx5_core_dev, priv); + if ((dev != tmp_dev) && + (mlx5_gen_pci_id(tmp_dev) == pci_id)) { + ldev = tmp_dev->priv.lag; + break; + } + } + + if (!ldev) { + ldev = mlx5_lag_dev_alloc(); + if (!ldev) { + mlx5_core_err(dev, "Failed to alloc lag dev\n"); + return; + } + } + + mlx5_lag_dev_add_pf(ldev, dev, netdev); + + if (!ldev->nb.notifier_call) { + ldev->nb.notifier_call = mlx5_lag_netdev_event; + if (register_netdevice_notifier(&ldev->nb)) { + ldev->nb.notifier_call = NULL; + mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + } + } +} + +/* Must be called with intf_mutex held */ +void mlx5_lag_remove(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + int i; + + ldev = mlx5_lag_dev_get(dev); + if (!ldev) + return; + + if (mlx5_lag_is_bonded(ldev)) + mlx5_deactivate_lag(ldev); + + mlx5_lag_dev_remove_pf(ldev, dev); + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev) + break; + + if (i == MLX5_MAX_PORTS) { + if (ldev->nb.notifier_call) + unregister_netdevice_notifier(&ldev->nb); + cancel_delayed_work_sync(&ldev->bond_work); + mlx5_lag_dev_free(ldev); + } +} + +bool mlx5_lag_is_active(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + bool res; + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + res = ldev && mlx5_lag_is_bonded(ldev); + mutex_unlock(&lag_mutex); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_active); + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d6446a3623b2..db7bcfcef3da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -73,8 +73,9 @@ module_param_named(prof_sel, prof_sel, int, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); static LIST_HEAD(intf_list); -static LIST_HEAD(dev_list); -static DEFINE_MUTEX(intf_mutex); + +LIST_HEAD(mlx5_dev_list); +DEFINE_MUTEX(mlx5_intf_mutex); struct mlx5_device_context { struct list_head list; @@ -820,11 +821,11 @@ static int mlx5_register_device(struct mlx5_core_dev *dev) struct mlx5_priv *priv = &dev->priv; struct mlx5_interface *intf; - mutex_lock(&intf_mutex); - list_add_tail(&priv->dev_list, &dev_list); + mutex_lock(&mlx5_intf_mutex); + list_add_tail(&priv->dev_list, &mlx5_dev_list); list_for_each_entry(intf, &intf_list, list) mlx5_add_device(intf, priv); - mutex_unlock(&intf_mutex); + mutex_unlock(&mlx5_intf_mutex); return 0; } @@ -834,11 +835,11 @@ static void mlx5_unregister_device(struct mlx5_core_dev *dev) struct mlx5_priv *priv = &dev->priv; struct mlx5_interface *intf; - mutex_lock(&intf_mutex); + mutex_lock(&mlx5_intf_mutex); list_for_each_entry(intf, &intf_list, list) mlx5_remove_device(intf, priv); list_del(&priv->dev_list); - mutex_unlock(&intf_mutex); + mutex_unlock(&mlx5_intf_mutex); } int mlx5_register_interface(struct mlx5_interface *intf) @@ -848,11 +849,11 @@ int mlx5_register_interface(struct mlx5_interface *intf) if (!intf->add || !intf->remove) return -EINVAL; - mutex_lock(&intf_mutex); + mutex_lock(&mlx5_intf_mutex); list_add_tail(&intf->list, &intf_list); - list_for_each_entry(priv, &dev_list, dev_list) + list_for_each_entry(priv, &mlx5_dev_list, dev_list) mlx5_add_device(intf, priv); - mutex_unlock(&intf_mutex); + mutex_unlock(&mlx5_intf_mutex); return 0; } @@ -862,11 +863,11 @@ void mlx5_unregister_interface(struct mlx5_interface *intf) { struct mlx5_priv *priv; - mutex_lock(&intf_mutex); - list_for_each_entry(priv, &dev_list, dev_list) + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) mlx5_remove_device(intf, priv); list_del(&intf->list); - mutex_unlock(&intf_mutex); + mutex_unlock(&mlx5_intf_mutex); } EXPORT_SYMBOL(mlx5_unregister_interface); @@ -892,6 +893,30 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol) } EXPORT_SYMBOL(mlx5_get_protocol_dev); +/* Must be called with intf_mutex held */ +void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) +{ + struct mlx5_interface *intf; + + list_for_each_entry(intf, &intf_list, list) + if (intf->protocol == protocol) { + mlx5_add_device(intf, &dev->priv); + break; + } +} + +/* Must be called with intf_mutex held */ +void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) +{ + struct mlx5_interface *intf; + + list_for_each_entry(intf, &intf_list, list) + if (intf->protocol == protocol) { + mlx5_remove_device(intf, &dev->priv); + break; + } +} + static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv) { struct pci_dev *pdev = dev->pdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index ef0eeedcceb6..173a6b2e358a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -46,6 +46,9 @@ extern int mlx5_core_debug_mask; +extern struct list_head mlx5_dev_list; +extern struct mutex mlx5_intf_mutex; + #define mlx5_core_dbg(__dev, format, ...) \ dev_dbg(&(__dev)->pdev->dev, "%s:%s:%d:(pid %d): " format, \ (__dev)->priv.name, __func__, __LINE__, current->pid, \ @@ -70,6 +73,9 @@ do { \ #define mlx5_core_info(__dev, format, ...) \ dev_info(&(__dev)->pdev->dev, format, ##__VA_ARGS__) +#define mlx5_core_for_each_priv(__priv) \ + list_for_each_entry(__priv, &mlx5_dev_list, dev_list) + enum { MLX5_CMD_DATA, /* print command payload only */ MLX5_CMD_TIME, /* print command execution time */ @@ -92,6 +98,12 @@ u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); void mlx5_cq_tasklet_cb(unsigned long data); +void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev); +void mlx5_lag_remove(struct mlx5_core_dev *dev); + +void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); +void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); + void mlx5e_init(void); void mlx5e_cleanup(void); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0ea78b5edbb2..ed983b8c3213 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -477,6 +477,7 @@ struct mlx5_fc_stats { }; struct mlx5_eswitch; +struct mlx5_lag; struct mlx5_rl_entry { u32 rate; @@ -550,6 +551,7 @@ struct mlx5_priv { struct mlx5_flow_steering *steering; struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; + struct mlx5_lag *lag; unsigned long pci_dev_data; struct mlx5_fc_stats fc_stats; struct mlx5_rl_table rl_table; @@ -942,6 +944,8 @@ int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); +bool mlx5_lag_is_active(struct mlx5_core_dev *dev); + struct mlx5_profile { u64 mask; u8 log_max_qp; -- cgit v1.2.3 From 6a32047a441b870dd2570fe0831dada5e9ce40f6 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 11:06:44 +0000 Subject: net/mlx5: Get RoCE netdev Used by IB driver for determining the IB bond device's netdev, when LAG is active. Returns PF0's netdev if mode is not active-backup, or the PF netdev of the active slave when mode is active-backup. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 27 +++++++++++++++++++++++++++ include/linux/mlx5/driver.h | 1 + 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index 3bf0a7ffe0e0..952305054e16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -528,3 +528,30 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_lag_is_active); +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) +{ + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + + if (!(ldev && mlx5_lag_is_bonded(ldev))) + goto unlock; + + if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + ndev = ldev->tracker.netdev_state[0].tx_enabled ? + ldev->pf[0].netdev : ldev->pf[1].netdev; + } else { + ndev = ldev->pf[0].netdev; + } + if (ndev) + dev_hold(ndev); + +unlock: + mutex_unlock(&lag_mutex); + + return ndev; +} +EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); + diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ed983b8c3213..c568dd927330 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -945,6 +945,7 @@ void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); struct mlx5_profile { u64 mask; -- cgit v1.2.3 From aaff1bea16bb7f259a263c3ae4633d092e2da799 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 09:57:05 +0000 Subject: net/mlx5: LAG demux flow table support Add interfaces to allow the creation and destruction of a LAG demux flow table. It is a special flow table used during LAG for redirecting non user-mode packets from PF0 to PF1 root ft, if a packet was received on phys port two. Signed-off-by: Aviv Heller Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 56 +++++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 31 +++++++++---- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 6 +++ include/linux/mlx5/fs.h | 3 ++ 5 files changed, 75 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 7aaefa9aaf1c..7a0415e6d339 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -58,6 +58,7 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, u16 vport, + enum fs_flow_table_op_mod op_mod, enum fs_flow_table_type type, unsigned int level, unsigned int log_size, struct mlx5_flow_table *next_ft, unsigned int *table_id) @@ -69,10 +70,6 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); - if (next_ft) { - MLX5_SET(create_flow_table_in, in, table_miss_mode, 1); - MLX5_SET(create_flow_table_in, in, table_miss_id, next_ft->id); - } MLX5_SET(create_flow_table_in, in, table_type, type); MLX5_SET(create_flow_table_in, in, level, level); MLX5_SET(create_flow_table_in, in, log_size, log_size); @@ -81,6 +78,22 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, MLX5_SET(create_flow_table_in, in, other_vport, 1); } + switch (op_mod) { + case FS_FT_OP_MOD_NORMAL: + if (next_ft) { + MLX5_SET(create_flow_table_in, in, table_miss_mode, 1); + MLX5_SET(create_flow_table_in, in, table_miss_id, next_ft->id); + } + break; + + case FS_FT_OP_MOD_LAG_DEMUX: + MLX5_SET(create_flow_table_in, in, op_mod, 0x1); + if (next_ft) + MLX5_SET(create_flow_table_in, in, lag_master_next_table_id, + next_ft->id); + break; + } + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); if (!err) *table_id = MLX5_GET(create_flow_table_out, out, @@ -117,17 +130,32 @@ int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev, MLX5_CMD_OP_MODIFY_FLOW_TABLE); MLX5_SET(modify_flow_table_in, in, table_type, ft->type); MLX5_SET(modify_flow_table_in, in, table_id, ft->id); - if (ft->vport) { - MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport); - MLX5_SET(modify_flow_table_in, in, other_vport, 1); - } - MLX5_SET(modify_flow_table_in, in, modify_field_select, - MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); - if (next_ft) { - MLX5_SET(modify_flow_table_in, in, table_miss_mode, 1); - MLX5_SET(modify_flow_table_in, in, table_miss_id, next_ft->id); + + if (ft->op_mod == FS_FT_OP_MOD_LAG_DEMUX) { + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, + lag_master_next_table_id, next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, + lag_master_next_table_id, 0); + } } else { - MLX5_SET(modify_flow_table_in, in, table_miss_mode, 0); + if (ft->vport) { + MLX5_SET(modify_flow_table_in, in, vport_number, + ft->vport); + MLX5_SET(modify_flow_table_in, in, other_vport, 1); + } + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, table_miss_mode, 1); + MLX5_SET(modify_flow_table_in, in, table_miss_id, + next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, table_miss_mode, 0); + } } return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h index ac52fdfb5096..c5bc4686c832 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -35,6 +35,7 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, u16 vport, + enum fs_flow_table_op_mod op_mod, enum fs_flow_table_type type, unsigned int level, unsigned int log_size, struct mlx5_flow_table *next_ft, unsigned int *table_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 243efc5a8bd1..eabd73482c86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -477,7 +477,8 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in) } static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte, - enum fs_flow_table_type table_type) + enum fs_flow_table_type table_type, + enum fs_flow_table_op_mod op_mod) { struct mlx5_flow_table *ft; @@ -487,6 +488,7 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft ft->level = level; ft->node.type = FS_TYPE_FLOW_TABLE; + ft->op_mod = op_mod; ft->type = table_type; ft->vport = vport; ft->max_fte = max_fte; @@ -724,6 +726,7 @@ static void list_add_flow_table(struct mlx5_flow_table *ft, } static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + enum fs_flow_table_op_mod op_mod, u16 vport, int prio, int max_fte, u32 level) { @@ -756,18 +759,19 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa level += fs_prio->start_level; ft = alloc_flow_table(level, vport, - roundup_pow_of_two(max_fte), - root->table_type); + max_fte ? roundup_pow_of_two(max_fte) : 0, + root->table_type, + op_mod); if (!ft) { err = -ENOMEM; goto unlock_root; } tree_init_node(&ft->node, 1, del_flow_table); - log_table_sz = ilog2(ft->max_fte); + log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0; next_ft = find_next_chained_ft(fs_prio); - err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->type, ft->level, - log_table_sz, next_ft, &ft->id); + err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type, + ft->level, log_table_sz, next_ft, &ft->id); if (err) goto free_ft; @@ -794,16 +798,27 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, int prio, int max_fte, u32 level) { - return __mlx5_create_flow_table(ns, 0, prio, max_fte, level); + return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, 0, prio, + max_fte, level); } struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, int max_fte, u32 level, u16 vport) { - return __mlx5_create_flow_table(ns, vport, prio, max_fte, level); + return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, vport, prio, + max_fte, level); } +struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( + struct mlx5_flow_namespace *ns, + int prio, u32 level) +{ + return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_LAG_DEMUX, 0, prio, 0, + level); +} +EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); + struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, int prio, int num_flow_table_entries, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 9cffb6aeb4e9..23e46e35413f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -51,6 +51,11 @@ enum fs_flow_table_type { FS_FT_FDB = 0X4, }; +enum fs_flow_table_op_mod { + FS_FT_OP_MOD_NORMAL, + FS_FT_OP_MOD_LAG_DEMUX, +}; + enum fs_fte_status { FS_FTE_STATUS_EXISTING = 1UL << 0, }; @@ -93,6 +98,7 @@ struct mlx5_flow_table { unsigned int max_fte; unsigned int level; enum fs_flow_table_type type; + enum fs_flow_table_op_mod op_mod; struct { bool active; unsigned int required_groups; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e036d6030867..7edfe0b8f1ec 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -106,6 +106,9 @@ mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, int num_flow_table_entries, u32 level, u16 vport); +struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( + struct mlx5_flow_namespace *ns, + int prio, u32 level); int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); /* inbox should be set with the following values: -- cgit v1.2.3 From 3e75d4ebaae7aac5ba82fc7a6e0e6fb56dac1916 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 10:02:29 +0000 Subject: net/mlx5: Add LAG flow steering namespace This namespace is used for LAG demux flowtable. The idea is to position the LAG demux ft between bypass and kernel flowtables, allowing raw-eth traffic from both ports to be received by the PF0 IB device. Signed-off-by: Aviv Heller Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 11 ++++++++++- include/linux/mlx5/fs.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index eabd73482c86..ac414b7f366e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -96,6 +96,10 @@ #define OFFLOADS_NUM_PRIOS 1 #define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1) +#define LAG_PRIO_NUM_LEVELS 1 +#define LAG_NUM_PRIOS 1 +#define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + 1) + struct node_caps { size_t arr_sz; long *caps; @@ -111,12 +115,16 @@ static struct init_tree_node { int num_levels; } root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 6, + .ar_size = 7, .children = (struct init_tree_node[]) { ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_CHAINING_CAPS, ADD_NS(ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS, BY_PASS_PRIO_NUM_LEVELS))), + ADD_PRIO(0, LAG_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(ADD_MULTIPLE_PRIO(LAG_NUM_PRIOS, + LAG_PRIO_NUM_LEVELS))), ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, {}, ADD_NS(ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS, OFFLOADS_MAX_FT))), ADD_PRIO(0, ETHTOOL_MIN_LEVEL, 0, @@ -1396,6 +1404,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, switch (type) { case MLX5_FLOW_NAMESPACE_BYPASS: + case MLX5_FLOW_NAMESPACE_LAG: case MLX5_FLOW_NAMESPACE_OFFLOADS: case MLX5_FLOW_NAMESPACE_ETHTOOL: case MLX5_FLOW_NAMESPACE_KERNEL: diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 7edfe0b8f1ec..8803212fc3aa 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -54,6 +54,7 @@ static inline void build_leftovers_ft_param(int *priority, enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_LAG, MLX5_FLOW_NAMESPACE_OFFLOADS, MLX5_FLOW_NAMESPACE_ETHTOOL, MLX5_FLOW_NAMESPACE_KERNEL, -- cgit v1.2.3 From 3bc34f3bcb087764796d9a6eaa476e270114eb8f Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 10:38:42 +0000 Subject: net/mlx5: Vport LAG creation support Add interfaces for issuing CREATE_VPORT_LAG and DESTROY_VPORT_LAG commands. Used for receiving PF1's eth traffic on PF0's root ft. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 22 ++++++++++++++++++++++ include/linux/mlx5/driver.h | 2 ++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index 3cb570ac2b80..5fe320c5a60e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -110,6 +110,28 @@ static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev) return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0}; + + MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_cmd_create_vport_lag); + +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0}; + + MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); + static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev) { return dev->priv.lag; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c568dd927330..5cb9fa7aec61 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -944,6 +944,8 @@ int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); -- cgit v1.2.3 From cea824d416522ce63d83b45fc0dc53c0f5b68cee Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 31 May 2016 14:09:09 +0300 Subject: net/mlx5: Introduce sniffer steering hardware capabilities Define needed hardware capabilities for sniffer RX and TX flow tables. Add the following capabilities: 1. Sniffer RX flow table capabilities. 2. Sniffer TX flow table capabilities. 3. If same TIR can be used by multiple flow tables of different types. Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 12 ++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2575070c836e..77c141797152 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -964,6 +964,18 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_NIC_RX_MAX(mdev, cap) \ MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive.cap) +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_sniffer.cap) + #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 043d5256b754..73a720f74a69 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -483,7 +483,9 @@ struct mlx5_ifc_ads_bits { struct mlx5_ifc_flow_table_nic_cap_bits { u8 nic_rx_multi_path_tirs[0x1]; - u8 reserved_at_1[0x1ff]; + u8 nic_rx_multi_path_tirs_fts[0x1]; + u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; + u8 reserved_at_3[0x1fd]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; -- cgit v1.2.3 From 87d22483ce68e609818d61e3a65361f5634c6cd6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 6 Jun 2016 18:09:35 +0300 Subject: net/mlx5: Add sniffer namespaces Add sniffer TX and RX namespaces to receive ingoing and outgoing traffic. Each outgoing/incoming packet is duplicated and steered to the sniffer TX/RX namespace in addition to the regular flow. Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 58 +++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 4 ++ include/linux/mlx5/fs.h | 2 + 3 files changed, 64 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index ac414b7f366e..f5571a5a57c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1427,6 +1427,16 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, return &steering->esw_ingress_root_ns->ns; else return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_RX: + if (steering->sniffer_rx_root_ns) + return &steering->sniffer_rx_root_ns->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_TX: + if (steering->sniffer_tx_root_ns) + return &steering->sniffer_tx_root_ns->ns; + else + return NULL; default: return NULL; } @@ -1726,10 +1736,46 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev) cleanup_root_ns(steering->esw_egress_root_ns); cleanup_root_ns(steering->esw_ingress_root_ns); cleanup_root_ns(steering->fdb_root_ns); + cleanup_root_ns(steering->sniffer_rx_root_ns); + cleanup_root_ns(steering->sniffer_tx_root_ns); mlx5_cleanup_fc_stats(dev); kfree(steering); } +static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_tx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_TX); + if (!steering->sniffer_tx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_tx_root_ns->ns, 0, 1); + if (IS_ERR(prio)) { + cleanup_root_ns(steering->sniffer_tx_root_ns); + return PTR_ERR(prio); + } + return 0; +} + +static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_rx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_RX); + if (!steering->sniffer_rx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_rx_root_ns->ns, 0, 1); + if (IS_ERR(prio)) { + cleanup_root_ns(steering->sniffer_rx_root_ns); + return PTR_ERR(prio); + } + return 0; +} + static int init_fdb_root_ns(struct mlx5_flow_steering *steering) { struct fs_prio *prio; @@ -1826,6 +1872,18 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) } } + if (MLX5_CAP_FLOWTABLE_SNIFFER_RX(dev, ft_support)) { + err = init_sniffer_rx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_SNIFFER_TX(dev, ft_support)) { + err = init_sniffer_tx_root_ns(steering); + if (err) + goto err; + } + return 0; err: mlx5_cleanup_fs(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 23e46e35413f..71ff03bceabb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -49,6 +49,8 @@ enum fs_flow_table_type { FS_FT_ESW_EGRESS_ACL = 0x2, FS_FT_ESW_INGRESS_ACL = 0x3, FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, }; enum fs_flow_table_op_mod { @@ -66,6 +68,8 @@ struct mlx5_flow_steering { struct mlx5_flow_root_namespace *fdb_root_ns; struct mlx5_flow_root_namespace *esw_egress_root_ns; struct mlx5_flow_root_namespace *esw_ingress_root_ns; + struct mlx5_flow_root_namespace *sniffer_tx_root_ns; + struct mlx5_flow_root_namespace *sniffer_rx_root_ns; }; struct fs_node { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8803212fc3aa..93ebc5e21334 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -63,6 +63,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_FDB, MLX5_FLOW_NAMESPACE_ESW_EGRESS, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + MLX5_FLOW_NAMESPACE_SNIFFER_RX, + MLX5_FLOW_NAMESPACE_SNIFFER_TX, }; struct mlx5_flow_table; -- cgit v1.2.3 From d194fd265e78ca1b2a4607918778446de44818b2 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 19 Aug 2016 08:34:57 +0300 Subject: qed*: Fix pause setting When moving into using ethtool's link_ksetting, qed started supplying its own bitmask of speed/capabilities, but qede is still checking for the SUPPORTED value to determine whether it supports pause. Fixes: 054c67d1c82a ("qed*: Add support for ethtool link_ksettings callbacks") Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 2 +- include/linux/qed/qed_if.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 6e17ee10e748..51782bf4987f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -672,7 +672,7 @@ static int qede_set_pauseparam(struct net_device *dev, memset(¶ms, 0, sizeof(params)); params.override_flags |= QED_LINK_OVERRIDE_PAUSE_CONFIG; if (epause->autoneg) { - if (!(current_link.supported_caps & SUPPORTED_Autoneg)) { + if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) { DP_INFO(edev, "autoneg not supported\n"); return -EINVAL; } diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 3ed7d20e3811..d8dc5c2243d5 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -318,9 +318,11 @@ struct qed_link_params { struct qed_link_output { bool link_up; - u32 supported_caps; /* In SUPPORTED defs */ - u32 advertised_caps; /* In ADVERTISED defs */ - u32 lp_caps; /* In ADVERTISED defs */ + /* In QED_LM_* defs */ + u32 supported_caps; + u32 advertised_caps; + u32 lp_caps; + u32 speed; /* In Mb/s */ u8 duplex; /* In DUPLEX defs */ u8 port; /* In PORT defs */ -- cgit v1.2.3 From f6a66927692e30bdc1792e7a1fc2107d4dfcf42d Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 17 Aug 2016 13:36:11 +0300 Subject: flow_dissector: Get vlan priority in addition to vlan id Add vlan priority check to the flow dissector by adding new flow dissector struct, flow_dissector_key_vlan which includes vlan tag fields. vlan_id and flow_label fields were under the same struct (flow_dissector_key_tags). It was a convenient setting since struct flow_dissector_key_tags is used by struct flow_keys and by setting vlan_id and flow_label under the same struct, we get precisely 24 or 48 bytes in flow_keys from flow_dissector_key_basic. Now, when adding vlan priority support, the code will be cleaner if flow_label and vlan tag won't be under the same struct anymore. Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 1 + include/net/flow_dissector.h | 12 +++++++++--- net/core/flow_dissector.c | 25 ++++++++++++++++--------- 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index a5f6ce6b578c..49d4aef1f789 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -81,6 +81,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) #define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) +#define skb_vlan_tag_get_prio(__skb) ((__skb)->vlan_tci & VLAN_PRIO_MASK) /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index d3d60dccd19f..f266b512c3bd 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -32,8 +32,13 @@ struct flow_dissector_key_basic { }; struct flow_dissector_key_tags { - u32 vlan_id:12, - flow_label:20; + u32 flow_label; +}; + +struct flow_dissector_key_vlan { + u16 vlan_id:12, + vlan_priority:3; + u16 padding; }; struct flow_dissector_key_keyid { @@ -119,7 +124,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ - FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */ + FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ @@ -148,6 +153,7 @@ struct flow_keys { #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic; struct flow_dissector_key_tags tags; + struct flow_dissector_key_vlan vlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_addrs addrs; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 362d693c003f..a2879c0f6c4c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -118,6 +118,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; bool skip_vlan = false; u8 ip_proto = 0; @@ -266,16 +267,22 @@ ipv6: skip_vlan = true; if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { - key_tags = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID, + FLOW_DISSECTOR_KEY_VLAN)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN, target_container); - if (skb_vlan_tag_present(skb)) - key_tags->vlan_id = skb_vlan_tag_get_id(skb); - else - key_tags->vlan_id = ntohs(vlan->h_vlan_TCI) & + if (skb_vlan_tag_present(skb)) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = + (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } } goto again; @@ -935,8 +942,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, ports), }, { - .key_id = FLOW_DISSECTOR_KEY_VLANID, - .offset = offsetof(struct flow_keys, tags), + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, -- cgit v1.2.3 From 9399ae9a6cb28ebac78216f715ace3b42f1c2132 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 17 Aug 2016 13:36:13 +0300 Subject: net_sched: flower: Add vlan support Enhance flower to support 802.1Q vlan protocol classification. Currently, the supported fields are vlan_id and vlan_priority. Example: # add a flower filter with vlan id and priority classification tc filter add dev ens4f0 protocol 802.1Q parent ffff: \ flower \ indev ens4f0 \ vlan_ethtype ipv4 \ vlan_id 100 \ vlan_prio 3 \ action vlan pop Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 ++ net/sched/cls_flower.c | 70 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index d1c1ccaba787..51b5b247fb5a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -428,6 +428,9 @@ enum { TCA_FLOWER_KEY_UDP_DST, /* be16 */ TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0080fc073019..1e11e57e6947 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,6 +28,7 @@ struct fl_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_vlan vlan; struct flow_dissector_key_addrs ipaddrs; union { struct flow_dissector_key_ipv4_addrs ipv4; @@ -293,6 +294,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, + }; static void fl_set_key_val(struct nlattr **tb, @@ -308,9 +313,29 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static void fl_set_key_vlan(struct nlattr **tb, + struct flow_dissector_key_vlan *key_val, + struct flow_dissector_key_vlan *key_mask) +{ +#define VLAN_PRIORITY_MASK 0x7 + + if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + key_val->vlan_id = + nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + key_mask->vlan_id = VLAN_VID_MASK; + } + if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + key_val->vlan_priority = + nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + VLAN_PRIORITY_MASK; + key_mask->vlan_priority = VLAN_PRIORITY_MASK; + } +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { + __be16 ethertype; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); @@ -328,9 +353,19 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); - fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + if (tb[TCA_FLOWER_KEY_ETH_TYPE]) + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); + + if (ethertype == htons(ETH_P_8021Q)) { + fl_set_key_vlan(tb, &key->vlan, &mask->vlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { @@ -438,6 +473,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_VLAN, vlan); skb_flow_dissector_init(&head->dissector, keys, cnt); } @@ -666,6 +703,29 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_vlan(struct sk_buff *skb, + struct flow_dissector_key_vlan *vlan_key, + struct flow_dissector_key_vlan *vlan_mask) +{ + int err; + + if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) + return 0; + if (vlan_mask->vlan_id) { + err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + vlan_key->vlan_id); + if (err) + return err; + } + if (vlan_mask->vlan_priority) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + vlan_key->vlan_priority); + if (err) + return err; + } + return 0; +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -710,6 +770,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto))) goto nla_put_failure; + + if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, -- cgit v1.2.3 From 956af37102b515512331a03c35c958b2a1d8dd87 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 17 Aug 2016 13:36:14 +0300 Subject: net_sched: act_vlan: Add priority option The current vlan push action supports only vid and protocol options. Add priority option. Example script that adds vlan push action with vid and priority: tc filter add dev veth0 protocol ip parent ffff: \ flower \ indev veth0 \ action vlan push id 100 priority 5 Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 1 + include/uapi/linux/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 13 +++++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index e29f52e8bdf1..6b835889ea30 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -20,6 +20,7 @@ struct tcf_vlan { int tcfv_action; u16 tcfv_push_vid; __be16 tcfv_push_proto; + u8 tcfv_push_prio; }; #define to_vlan(a) ((struct tcf_vlan *)a) diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 31151ff6264f..be72b6e3843b 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -29,6 +29,7 @@ enum { TCA_VLAN_PUSH_VLAN_ID, TCA_VLAN_PUSH_VLAN_PROTOCOL, TCA_VLAN_PAD, + TCA_VLAN_PUSH_VLAN_PRIORITY, __TCA_VLAN_MAX, }; #define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 691409de3e1a..59a8d3150ae2 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -43,7 +43,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid); + err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | + (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -65,6 +66,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, @@ -78,6 +80,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; + u8 push_prio = 0; bool exists = false; int ret = 0, err; @@ -123,6 +126,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } else { push_proto = htons(ETH_P_8021Q); } + + if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; default: if (exists) @@ -150,6 +156,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v->tcfv_action = action; v->tcfv_push_vid = push_vid; + v->tcfv_push_prio = push_prio; v->tcfv_push_proto = push_proto; v->tcf_action = parm->action; @@ -181,7 +188,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (v->tcfv_action == TCA_VLAN_ACT_PUSH && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto))) + v->tcfv_push_proto) || + (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, + v->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); -- cgit v1.2.3 From 61ba1a2da9693b88bf5f2bb8e7a99a29cd139122 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 17 Aug 2016 12:53:10 +0200 Subject: net: bridge: export vlan flags with the stats Use one of the vlan xstats padding fields to export the vlan flags. This is needed in order to be able to distinguish between master (bridge) and port vlan entries in user-space when dumping the bridge vlan stats. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 +- net/bridge/br_netlink.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c186f64fffca..ab92bca6d448 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -140,7 +140,7 @@ struct bridge_vlan_xstats { __u64 tx_bytes; __u64 tx_packets; __u16 vid; - __u16 pad1; + __u16 flags; __u32 pad2; }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 493ab9b3d51a..872d4c0deb59 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1321,6 +1321,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; + vxi.flags = v->flags; br_vlan_get_stats(v, &stats); vxi.rx_bytes = stats.rx_bytes; vxi.rx_packets = stats.rx_packets; -- cgit v1.2.3 From 1cb94db3d1bfe0075bde78fb2989f17e0a8a3936 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Wed, 17 Aug 2016 23:00:30 +0200 Subject: net: bgmac: support Ethernet core on BCM53573 SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM53573 is a new series of Broadcom's SoCs. It's based on ARM and can be found in two packages (versions): BCM53573 and BCM47189. It shares some code with the Northstar family, but also requires some new quirks. First of all there can be up to 2 Ethernet cores on this SoC. If that is the case, they are connected to two different switch ports allowing some more complex/optimized setups. It seems the second unit doesn't come fully configured and requires some IRQ quirk. Other than that only the first core is connected to the PHY. For the second one we have to register fixed PHY (similarly to the Northstar), otherwise generic PHY driver would get some invalid info. This has been successfully tested on Tenda AC9 (BCM47189B0). Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bgmac-bcma.c | 19 ++++++++++++++++++- drivers/net/ethernet/broadcom/bgmac.c | 25 +++++++++++++++++++++++++ drivers/net/ethernet/broadcom/bgmac.h | 19 +++++++++++++++++++ include/linux/bcma/bcma.h | 3 +++ include/linux/bcma/bcma_regs.h | 1 + 5 files changed, 66 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 9a9745c4047c..3bc0a04df107 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -92,6 +92,7 @@ MODULE_DEVICE_TABLE(bcma, bgmac_bcma_tbl); /* http://bcm-v4.sipsolutions.net/mac-gbit/gmac/chipattach */ static int bgmac_probe(struct bcma_device *core) { + struct bcma_chipinfo *ci = &core->bus->chipinfo; struct ssb_sprom *sprom = &core->bus->sprom; struct mii_bus *mii_bus; struct bgmac *bgmac; @@ -157,7 +158,8 @@ static int bgmac_probe(struct bcma_device *core) dev_info(bgmac->dev, "Found PHY addr: %d%s\n", bgmac->phyaddr, bgmac->phyaddr == BGMAC_PHY_NOREGS ? " (NOREGS)" : ""); - if (!bgmac_is_bcm4707_family(core)) { + if (!bgmac_is_bcm4707_family(core) && + !(ci->id == BCMA_CHIP_ID_BCM53573 && core->core_unit == 1)) { mii_bus = bcma_mdio_mii_register(core, bgmac->phyaddr); if (!IS_ERR(mii_bus)) { err = PTR_ERR(mii_bus); @@ -230,6 +232,21 @@ static int bgmac_probe(struct bcma_device *core) bgmac->feature_flags |= BGMAC_FEAT_NO_RESET; bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500; break; + case BCMA_CHIP_ID_BCM53573: + bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; + bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK; + if (ci->pkg == BCMA_PKG_ID_BCM47189) + bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED; + if (core->core_unit == 0) { + bgmac->feature_flags |= BGMAC_FEAT_CC4_IF_SW_TYPE; + if (ci->pkg == BCMA_PKG_ID_BCM47189) + bgmac->feature_flags |= + BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII; + } else if (core->core_unit == 1) { + bgmac->feature_flags |= BGMAC_FEAT_IRQ_ID_OOB_6; + bgmac->feature_flags |= BGMAC_FEAT_CC7_IF_TYPE_RGMII; + } + break; default: bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK; diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index c4751ece76f6..63ef72355833 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -940,6 +940,27 @@ static void bgmac_chip_reset(struct bgmac *bgmac) bgmac_cco_ctl_maskset(bgmac, 1, ~(BGMAC_CHIPCTL_1_IF_TYPE_MASK | BGMAC_CHIPCTL_1_SW_TYPE_MASK), sw_type); + } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE) { + u32 sw_type = BGMAC_CHIPCTL_4_IF_TYPE_MII | + BGMAC_CHIPCTL_4_SW_TYPE_EPHY; + u8 et_swtype = 0; + char buf[4]; + + if (bcm47xx_nvram_getenv("et_swtype", buf, sizeof(buf)) > 0) { + if (kstrtou8(buf, 0, &et_swtype)) + dev_err(bgmac->dev, "Failed to parse et_swtype (%s)\n", + buf); + sw_type = (et_swtype & 0x0f) << 12; + } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII) { + sw_type = BGMAC_CHIPCTL_4_IF_TYPE_RGMII | + BGMAC_CHIPCTL_4_SW_TYPE_RGMII; + } + bgmac_cco_ctl_maskset(bgmac, 4, ~(BGMAC_CHIPCTL_4_IF_TYPE_MASK | + BGMAC_CHIPCTL_4_SW_TYPE_MASK), + sw_type); + } else if (bgmac->feature_flags & BGMAC_FEAT_CC7_IF_TYPE_RGMII) { + bgmac_cco_ctl_maskset(bgmac, 7, ~BGMAC_CHIPCTL_7_IF_TYPE_MASK, + BGMAC_CHIPCTL_7_IF_TYPE_RGMII); } if (iost & BGMAC_BCMA_IOST_ATTACHED && !bgmac->has_robosw) @@ -1467,6 +1488,10 @@ int bgmac_enet_probe(struct bgmac *info) */ bgmac_clk_enable(bgmac, 0); + /* This seems to be fixing IRQ by assigning OOB #6 to the core */ + if (bgmac->feature_flags & BGMAC_FEAT_IRQ_ID_OOB_6) + bgmac_idm_write(bgmac, BCMA_OOB_SEL_OUT_A30, 0x86); + bgmac_chip_reset(bgmac); err = bgmac_dma_alloc(bgmac); diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h index 24a250267b88..80836b4c9f38 100644 --- a/drivers/net/ethernet/broadcom/bgmac.h +++ b/drivers/net/ethernet/broadcom/bgmac.h @@ -369,6 +369,21 @@ #define BGMAC_CHIPCTL_1_SW_TYPE_RGMII 0x000000C0 #define BGMAC_CHIPCTL_1_RXC_DLL_BYPASS 0x00010000 +#define BGMAC_CHIPCTL_4_IF_TYPE_MASK 0x00003000 +#define BGMAC_CHIPCTL_4_IF_TYPE_RMII 0x00000000 +#define BGMAC_CHIPCTL_4_IF_TYPE_MII 0x00001000 +#define BGMAC_CHIPCTL_4_IF_TYPE_RGMII 0x00002000 +#define BGMAC_CHIPCTL_4_SW_TYPE_MASK 0x0000C000 +#define BGMAC_CHIPCTL_4_SW_TYPE_EPHY 0x00000000 +#define BGMAC_CHIPCTL_4_SW_TYPE_EPHYMII 0x00004000 +#define BGMAC_CHIPCTL_4_SW_TYPE_EPHYRMII 0x00008000 +#define BGMAC_CHIPCTL_4_SW_TYPE_RGMII 0x0000C000 + +#define BGMAC_CHIPCTL_7_IF_TYPE_MASK 0x000000C0 +#define BGMAC_CHIPCTL_7_IF_TYPE_RMII 0x00000000 +#define BGMAC_CHIPCTL_7_IF_TYPE_MII 0x00000040 +#define BGMAC_CHIPCTL_7_IF_TYPE_RGMII 0x00000080 + #define BGMAC_WEIGHT 64 #define ETHER_MAX_LEN 1518 @@ -390,6 +405,10 @@ #define BGMAC_FEAT_NO_CLR_MIB BIT(13) #define BGMAC_FEAT_FORCE_SPEED_2500 BIT(14) #define BGMAC_FEAT_CMDCFG_SR_REV4 BIT(15) +#define BGMAC_FEAT_IRQ_ID_OOB_6 BIT(16) +#define BGMAC_FEAT_CC4_IF_SW_TYPE BIT(17) +#define BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII BIT(18) +#define BGMAC_FEAT_CC7_IF_TYPE_RGMII BIT(19) struct bgmac_slot_info { union { diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3db25df396cb..8eeedb2db924 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -205,6 +205,9 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4709 0 #define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 +#define BCMA_CHIP_ID_BCM53573 53573 +#define BCMA_PKG_ID_BCM53573 0 +#define BCMA_PKG_ID_BCM47189 1 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index ebd5c1fcdea4..c607fce6aadd 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -23,6 +23,7 @@ #define BCMA_CLKCTLST_4328A0_HAVEALP 0x00020000 /* 4328a0 has reversed bits */ /* Agent registers (common for every core) */ +#define BCMA_OOB_SEL_OUT_A30 0x0100 #define BCMA_IOCTL 0x0408 /* IO control */ #define BCMA_IOCTL_CLK 0x0001 #define BCMA_IOCTL_FGC 0x0002 -- cgit v1.2.3 From b34040227be7da760cc72ef3c807e0985e7f0f16 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 18 Aug 2016 10:33:52 +0200 Subject: tipc: add peer removal functionality Add TIPC_NL_PEER_REMOVE netlink command. This command can remove an offline peer node from the internal data structures. This will be supported by the tipc user space tool in iproute2. Signed-off-by: Richard Alpe Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/net.h | 2 ++ net/tipc/netlink.c | 5 ++++ net/tipc/node.c | 63 +++++++++++++++++++++++++++++++++++++++ net/tipc/node.h | 1 + 5 files changed, 72 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 5f3f6d09fb79..bcb65ef725f6 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -59,6 +59,7 @@ enum { TIPC_NL_MON_SET, TIPC_NL_MON_GET, TIPC_NL_MON_PEER_GET, + TIPC_NL_PEER_REMOVE, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/net.h b/net/tipc/net.h index 77a7a118911d..c7c254902873 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -39,6 +39,8 @@ #include +extern const struct nla_policy tipc_nl_net_policy[]; + int tipc_net_start(struct net *net, u32 addr); void tipc_net_stop(struct net *net); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index a84daec0afe9..2718de667828 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -238,6 +238,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .dumpit = tipc_nl_node_dump_monitor_peer, .policy = tipc_nl_policy, }, + { + .cmd = TIPC_NL_PEER_REMOVE, + .doit = tipc_nl_peer_rm, + .policy = tipc_nl_policy, + } }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) diff --git a/net/tipc/node.c b/net/tipc/node.c index 21974191e425..7e8b75fd1a02 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1553,6 +1553,69 @@ discard: kfree_skb(skb); } +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + struct tipc_node *peer; + u32 addr; + int err; + int i; + + /* We identify the peer by its net */ + if (!info->attrs[TIPC_NLA_NET]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_NET_ADDR]) + return -EINVAL; + + addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + + if (in_own_node(net, addr)) + return -ENOTSUPP; + + spin_lock_bh(&tn->node_list_lock); + peer = tipc_node_find(net, addr); + if (!peer) { + spin_unlock_bh(&tn->node_list_lock); + return -ENXIO; + } + + tipc_node_write_lock(peer); + if (peer->state != SELF_DOWN_PEER_DOWN && + peer->state != SELF_DOWN_PEER_LEAVING) { + tipc_node_write_unlock(peer); + err = -EBUSY; + goto err_out; + } + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &peer->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + peer->link_cnt--; + } + } + tipc_node_write_unlock(peer); + tipc_node_delete(peer); + + err = 0; +err_out: + tipc_node_put(peer); + spin_unlock_bh(&tn->node_list_lock); + + return err; +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index d69fdfcc0ec9..4578b34c7dca 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -77,6 +77,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From 5293efe62df81908f2e90c9820c7edcc8e61f5e9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Aug 2016 01:00:39 +0200 Subject: bpf: add bpf_skb_change_tail helper This work adds a bpf_skb_change_tail() helper for tc BPF programs. The basic idea is to expand or shrink the skb in a controlled manner. The eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(), bpf_lX_csum_replace() and others rather than passing a raw buffer for writing here. bpf_skb_change_tail() is really a slow path helper and intended for replies with f.e. ICMP control messages. Concept is similar to other helpers like bpf_skb_change_proto() helper to keep the helper without protocol specifics and let the BPF program mangle the remaining parts. A flags field has been added and is reserved for now should we extend the helper in future. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 43 +++++++++++++++++++- include/uapi/linux/bpf.h | 11 ++++++ net/core/filter.c | 100 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 150 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f665cb26b50..7047448e8129 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) int ___pskb_trim(struct sk_buff *skb, unsigned int len); -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (unlikely(skb_is_nonlinear(skb))) { WARN_ON(1); @@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) skb_set_tail_pointer(skb, len); } +static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + __skb_set_length(skb, len); +} + void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) @@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) BUG_ON(err); } +static inline int __skb_grow(struct sk_buff *skb, unsigned int len) +{ + unsigned int diff = len - skb->len; + + if (skb_tailroom(skb) < diff) { + int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), + GFP_ATOMIC); + if (ret) + return ret; + } + __skb_set_length(skb, len); + return 0; +} + /** * skb_orphan - orphan a buffer * @skb: buffer to orphan @@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return __pskb_trim(skb, len); } +static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + __skb_trim(skb, len); + return 0; +} + +static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + return __skb_grow(skb, len); +} + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +static inline void skb_gso_reset(struct sk_buff *skb) +{ + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 866d53c33298..e4c5a1baa993 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -386,6 +386,17 @@ enum bpf_func_id { */ BPF_FUNC_current_task_under_cgroup, + /** + * bpf_skb_change_tail(skb, len, flags) + * The helper will resize the skb to the given new size, + * to be used f.e. with control messages. + * @skb: pointer to skb + * @len: new skb length + * @flags: reserved + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_change_tail, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 58b5e6dd25fe..abf546d96b6b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1350,14 +1350,18 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err; + int err = __bpf_try_make_writable(skb, write_len); - err = skb_ensure_writable(skb, write_len); bpf_compute_data_end(skb); - return err; } @@ -1992,6 +1996,92 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : + 65536; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + u32 max_len = __bpf_skb_max_len(skb); + u32 min_len = __bpf_skb_min_len(skb); + u32 new_len = (u32) r2; + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push) @@ -2002,6 +2092,8 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_change_proto) return true; + if (func == bpf_skb_change_tail) + return true; if (func == bpf_l3_csum_replace) return true; if (func == bpf_l4_csum_replace) @@ -2368,6 +2460,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From 54fd9c2dff144ed287ab3b8189dcdcd4d298d0cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Aug 2016 01:00:41 +0200 Subject: bpf: get rid of cgroup helper related ifdefs As recently discussed during the task_under_cgroup_hierarchy() addition, we should get rid of the ifdefs surrounding the bpf_skb_under_cgroup() helper. If related functionality is not built-in, the helper cannot be used anyway, which is also in line with what we do for all other helpers. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ net/core/filter.c | 6 +----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ff5be7e8ddea..2aab9b63bf16 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1114,6 +1114,16 @@ static inline bool sk_stream_is_writeable(const struct sock *sk) sk_stream_memory_free(sk); } +static inline int sk_under_cgroup_hierarchy(struct sock *sk, + struct cgroup *ancestor) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), + ancestor); +#else + return -ENOTSUPP; +#endif +} static inline bool sk_has_memory_pressure(const struct sock *sk) { diff --git a/net/core/filter.c b/net/core/filter.c index 3b60dfd2ce92..a83766be1ad2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2374,7 +2374,6 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -#ifdef CONFIG_SOCK_CGROUP_DATA static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long)r1; @@ -2395,7 +2394,7 @@ static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) if (unlikely(!cgrp)) return -EAGAIN; - return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); + return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { @@ -2406,7 +2405,6 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; -#endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) @@ -2515,10 +2513,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; -#ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; -#endif default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 246779dd090bd1b74d2652b3a6ca7759f593b27a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Aug 2016 16:50:56 +0800 Subject: rhashtable: Remove GFP flag from rhashtable_walk_init The commit 8f6fd83c6c5ec66a4a70c728535ddcdfef4f3697 ("rhashtable: accept GFP flags in rhashtable_walk_init") added a GFP flag argument to rhashtable_walk_init because some users wish to use the walker in an unsleepable context. In fact we don't need to allocate memory in rhashtable_walk_init at all. The walker is always paired with an iterator so we could just stash ourselves there. This patch does that by introducing a new enter function to replace the existing init function. This way we don't have to churn all the existing users again. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 14 +++++++++++--- lib/rhashtable.c | 46 ++++++++++++++++++---------------------------- 2 files changed, 29 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3eef0802a0cd..8b72ee710f95 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -173,7 +173,7 @@ struct rhashtable_walker { struct rhashtable_iter { struct rhashtable *ht; struct rhash_head *p; - struct rhashtable_walker *walker; + struct rhashtable_walker walker; unsigned int slot; unsigned int skip; }; @@ -346,8 +346,8 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, struct bucket_table *old_tbl); int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp); +void rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU); void *rhashtable_walk_next(struct rhashtable_iter *iter); @@ -906,4 +906,12 @@ static inline int rhashtable_replace_fast( return err; } +/* Obsolete function, do not use in new code. */ +static inline int rhashtable_walk_init(struct rhashtable *ht, + struct rhashtable_iter *iter, gfp_t gfp) +{ + rhashtable_walk_enter(ht, iter); + return 0; +} + #endif /* _LINUX_RHASHTABLE_H */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5ba520b544d7..97e3cf08142c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -489,10 +489,9 @@ exit: EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** - * rhashtable_walk_init - Initialise an iterator + * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator - * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -507,30 +506,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * This function may sleep so you must not call it from interrupt * context or with spin locks held. * - * You must call rhashtable_walk_exit if this function returns - * successfully. + * You must call rhashtable_walk_exit after this function returns. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp) +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), gfp); - if (!iter->walker) - return -ENOMEM; - spin_lock(&ht->lock); - iter->walker->tbl = + iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); - list_add(&iter->walker->list, &iter->walker->tbl->walkers); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); - - return 0; } -EXPORT_SYMBOL_GPL(rhashtable_walk_init); +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator @@ -541,10 +532,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); - kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -570,12 +560,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) rcu_read_lock(); spin_lock(&ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&ht->lock); - if (!iter->walker->tbl) { - iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -597,7 +587,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; @@ -630,8 +620,8 @@ next: /* Ensure we see any new tables. */ smp_rmb(); - iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (iter->walker->tbl) { + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); @@ -651,7 +641,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; @@ -660,9 +650,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) spin_lock(&ht->lock); if (tbl->rehash < tbl->size) - list_add(&iter->walker->list, &tbl->walkers); + list_add(&iter->walker.list, &tbl->walkers); else - iter->walker->tbl = NULL; + iter->walker.tbl = NULL; spin_unlock(&ht->lock); iter->p = NULL; -- cgit v1.2.3 From ea825e70d0e0798eda3a57b05c90f21f5a369128 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Aug 2016 15:30:12 -0700 Subject: net: dsa: Export suspend/resume functions In preparation for allowing switch drivers to implement system-wide suspend/resume functions, export dsa_switch_suspend and dsa_switch_resume() such that these are callable from the appropriate driver specific suspend/resume functions. Reviewed-by: Andrew Lunn Tested-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 14 ++++++++++++++ net/dsa/dsa.c | 6 ++++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2217a3f817f8..d00c392bc9f8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -386,4 +386,18 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds, struct device_node *np); +#ifdef CONFIG_PM_SLEEP +int dsa_switch_suspend(struct dsa_switch *ds); +int dsa_switch_resume(struct dsa_switch *ds); +#else +static inline int dsa_switch_suspend(struct dsa_switch *ds) +{ + return 0; +} +static inline int dsa_switch_resume(struct dsa_switch *ds) +{ + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + #endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7e68bc6bc853..9f5b47200365 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -543,7 +543,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) } #ifdef CONFIG_PM_SLEEP -static int dsa_switch_suspend(struct dsa_switch *ds) +int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; @@ -562,8 +562,9 @@ static int dsa_switch_suspend(struct dsa_switch *ds) return ret; } +EXPORT_SYMBOL_GPL(dsa_switch_suspend); -static int dsa_switch_resume(struct dsa_switch *ds) +int dsa_switch_resume(struct dsa_switch *ds) { int i, ret = 0; @@ -585,6 +586,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) return 0; } +EXPORT_SYMBOL_GPL(dsa_switch_resume); #endif /* platform driver init and cleanup *****************************************/ -- cgit v1.2.3 From 05fafbfb3d77f43ae18341ddc61eb5c477896778 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 19 Aug 2016 09:33:31 +0300 Subject: qed: utilize FW 8.10.10.0 This new firmware for the qed* adpaters fixes several issues: - Better blocking of malicious VFs. - After FLR, Tx-switching [internal routing] of packets might be incorrect. - Deletion of unicast MAC filters would sometime have side-effect of corrupting the MAC filters configred for a device. It also contains fixes for future qed* drivers that *hopefully* would be sent for review in the near future. In addition, it would allow driver some new functionality, including: - Allowing PF/VF driver compaitibility with old drivers [running pre-8.10.5.0 firmware]. - Better debug facilities. This would also bump the qed* driver versions to 8.10.9.20. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 2 +- drivers/net/ethernet/qlogic/qed/qed_dev.c | 48 +- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 1424 +++++++++++++++--------- drivers/net/ethernet/qlogic/qed/qed_init_ops.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 28 - drivers/net/ethernet/qlogic/qed/qed_mcp.h | 2 - drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 6 + drivers/net/ethernet/qlogic/qed/qed_sriov.c | 13 +- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- include/linux/qed/common_hsi.h | 361 +++++- include/linux/qed/eth_common.h | 155 +-- include/linux/qed/iscsi_common.h | 28 +- include/linux/qed/qed_chain.h | 13 - include/linux/qed/tcp_common.h | 16 +- 14 files changed, 1375 insertions(+), 725 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index d0beb5d10a02..9ba7a7ee3357 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -26,7 +26,7 @@ #include "qed_hsi.h" extern const struct qed_common_ops qed_common_ops_pass; -#define DRV_MODULE_VERSION "8.7.1.20" +#define DRV_MODULE_VERSION "8.10.9.20" #define MAX_HWFNS_PER_DEVICE (4) #define NAME_SIZE 16 diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 8117ddff501b..5ae27f2d2fa5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -772,6 +772,9 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn, concrete_fid = qed_vfid_to_concrete(p_hwfn, vf_id); qed_fid_pretend(p_hwfn, p_ptt, (u16) concrete_fid); qed_wr(p_hwfn, p_ptt, CCFC_REG_STRONG_ENABLE_VF, 0x1); + qed_wr(p_hwfn, p_ptt, CCFC_REG_WEAK_ENABLE_VF, 0x0); + qed_wr(p_hwfn, p_ptt, TCFC_REG_STRONG_ENABLE_VF, 0x1); + qed_wr(p_hwfn, p_ptt, TCFC_REG_WEAK_ENABLE_VF, 0x0); } /* pretend to original PF */ qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); @@ -782,34 +785,8 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn, static int qed_hw_init_port(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, int hw_mode) { - int rc = 0; - - rc = qed_init_run(p_hwfn, p_ptt, PHASE_PORT, p_hwfn->port_id, hw_mode); - if (rc) - return rc; - - if (hw_mode & (1 << MODE_MF_SI)) { - u8 pf_id = 0; - - if (!qed_hw_init_first_eth(p_hwfn, p_ptt, &pf_id)) { - DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, - "PF[%08x] is first eth on engine\n", pf_id); - - /* We should have configured BIT for ppfid, i.e., the - * relative function number in the port. But there's a - * bug in LLH in BB where the ppfid is actually engine - * based, so we need to take this into account. - */ - qed_wr(p_hwfn, p_ptt, - NIG_REG_LLH_TAGMAC_DEF_PF_VECTOR, 1 << pf_id); - } - - /* Take the protocol-based hit vector if there is a hit, - * otherwise take the other vector. - */ - qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_CLS_TYPE_DUALMODE, 0x2); - } - return rc; + return qed_init_run(p_hwfn, p_ptt, PHASE_PORT, + p_hwfn->port_id, hw_mode); } static int qed_hw_init_pf(struct qed_hwfn *p_hwfn, @@ -878,21 +855,6 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn, /* Pure runtime initializations - directly to the HW */ qed_int_igu_init_pure_rt(p_hwfn, p_ptt, true, true); - if (hw_mode & (1 << MODE_MF_SI)) { - u8 pf_id = 0; - u32 val = 0; - - if (!qed_hw_init_first_eth(p_hwfn, p_ptt, &pf_id)) { - if (p_hwfn->rel_pf_id == pf_id) { - DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, - "PF[%d] is first ETH on engine\n", - pf_id); - val = 1; - } - qed_wr(p_hwfn, p_ptt, PRS_REG_MSG_INFO, val); - } - } - if (b_hw_start) { /* enable interrupts */ qed_int_igu_enable(p_hwfn, p_ptt, int_mode); diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index acf5da39ecb0..a67b3554aabd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -536,6 +536,244 @@ struct core_conn_context { struct regpair ustorm_st_padding[2]; }; +enum core_error_handle { + LL2_DROP_PACKET, + LL2_DO_NOTHING, + LL2_ASSERT, + MAX_CORE_ERROR_HANDLE +}; + +enum core_event_opcode { + CORE_EVENT_TX_QUEUE_START, + CORE_EVENT_TX_QUEUE_STOP, + CORE_EVENT_RX_QUEUE_START, + CORE_EVENT_RX_QUEUE_STOP, + MAX_CORE_EVENT_OPCODE +}; + +enum core_l4_pseudo_checksum_mode { + CORE_L4_PSEUDO_CSUM_CORRECT_LENGTH, + CORE_L4_PSEUDO_CSUM_ZERO_LENGTH, + MAX_CORE_L4_PSEUDO_CHECKSUM_MODE +}; + +struct core_ll2_port_stats { + struct regpair gsi_invalid_hdr; + struct regpair gsi_invalid_pkt_length; + struct regpair gsi_unsupported_pkt_typ; + struct regpair gsi_crcchksm_error; +}; + +struct core_ll2_pstorm_per_queue_stat { + struct regpair sent_ucast_bytes; + struct regpair sent_mcast_bytes; + struct regpair sent_bcast_bytes; + struct regpair sent_ucast_pkts; + struct regpair sent_mcast_pkts; + struct regpair sent_bcast_pkts; +}; + +struct core_ll2_rx_prod { + __le16 bd_prod; + __le16 cqe_prod; + __le32 reserved; +}; + +struct core_ll2_tstorm_per_queue_stat { + struct regpair packet_too_big_discard; + struct regpair no_buff_discard; +}; + +struct core_ll2_ustorm_per_queue_stat { + struct regpair rcv_ucast_bytes; + struct regpair rcv_mcast_bytes; + struct regpair rcv_bcast_bytes; + struct regpair rcv_ucast_pkts; + struct regpair rcv_mcast_pkts; + struct regpair rcv_bcast_pkts; +}; + +enum core_ramrod_cmd_id { + CORE_RAMROD_UNUSED, + CORE_RAMROD_RX_QUEUE_START, + CORE_RAMROD_TX_QUEUE_START, + CORE_RAMROD_RX_QUEUE_STOP, + CORE_RAMROD_TX_QUEUE_STOP, + MAX_CORE_RAMROD_CMD_ID +}; + +enum core_roce_flavor_type { + CORE_ROCE, + CORE_RROCE, + MAX_CORE_ROCE_FLAVOR_TYPE +}; + +struct core_rx_action_on_error { + u8 error_type; +#define CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG_MASK 0x3 +#define CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG_SHIFT 0 +#define CORE_RX_ACTION_ON_ERROR_NO_BUFF_MASK 0x3 +#define CORE_RX_ACTION_ON_ERROR_NO_BUFF_SHIFT 2 +#define CORE_RX_ACTION_ON_ERROR_RESERVED_MASK 0xF +#define CORE_RX_ACTION_ON_ERROR_RESERVED_SHIFT 4 +}; + +struct core_rx_bd { + struct regpair addr; + __le16 reserved[4]; +}; + +struct core_rx_bd_with_buff_len { + struct regpair addr; + __le16 buff_length; + __le16 reserved[3]; +}; + +union core_rx_bd_union { + struct core_rx_bd rx_bd; + struct core_rx_bd_with_buff_len rx_bd_with_len; +}; + +struct core_rx_cqe_opaque_data { + __le32 data[2]; +}; + +enum core_rx_cqe_type { + CORE_RX_CQE_ILLIGAL_TYPE, + CORE_RX_CQE_TYPE_REGULAR, + CORE_RX_CQE_TYPE_GSI_OFFLOAD, + CORE_RX_CQE_TYPE_SLOW_PATH, + MAX_CORE_RX_CQE_TYPE +}; + +struct core_rx_fast_path_cqe { + u8 type; + u8 placement_offset; + struct parsing_and_err_flags parse_flags; + __le16 packet_length; + __le16 vlan; + struct core_rx_cqe_opaque_data opaque_data; + __le32 reserved[4]; +}; + +struct core_rx_gsi_offload_cqe { + u8 type; + u8 data_length_error; + struct parsing_and_err_flags parse_flags; + __le16 data_length; + __le16 vlan; + __le32 src_mac_addrhi; + __le16 src_mac_addrlo; + u8 reserved1[2]; + __le32 gid_dst[4]; +}; + +struct core_rx_slow_path_cqe { + u8 type; + u8 ramrod_cmd_id; + __le16 echo; + __le32 reserved1[7]; +}; + +union core_rx_cqe_union { + struct core_rx_fast_path_cqe rx_cqe_fp; + struct core_rx_gsi_offload_cqe rx_cqe_gsi; + struct core_rx_slow_path_cqe rx_cqe_sp; +}; + +struct core_rx_start_ramrod_data { + struct regpair bd_base; + struct regpair cqe_pbl_addr; + __le16 mtu; + __le16 sb_id; + u8 sb_index; + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 drop_ttl0_flg; + __le16 num_of_pbl_pages; + u8 inner_vlan_removal_en; + u8 queue_id; + u8 main_func_queue; + u8 mf_si_bcast_accept_all; + u8 mf_si_mcast_accept_all; + struct core_rx_action_on_error action_on_error; + u8 gsi_offload_flag; + u8 reserved[7]; +}; + +struct core_rx_stop_ramrod_data { + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 queue_id; + u8 reserved1; + __le16 reserved2[2]; +}; + +struct core_tx_bd_flags { + u8 as_bitfield; +#define CORE_TX_BD_FLAGS_FORCE_VLAN_MODE_MASK 0x1 +#define CORE_TX_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 0 +#define CORE_TX_BD_FLAGS_VLAN_INSERTION_MASK 0x1 +#define CORE_TX_BD_FLAGS_VLAN_INSERTION_SHIFT 1 +#define CORE_TX_BD_FLAGS_START_BD_MASK 0x1 +#define CORE_TX_BD_FLAGS_START_BD_SHIFT 2 +#define CORE_TX_BD_FLAGS_IP_CSUM_MASK 0x1 +#define CORE_TX_BD_FLAGS_IP_CSUM_SHIFT 3 +#define CORE_TX_BD_FLAGS_L4_CSUM_MASK 0x1 +#define CORE_TX_BD_FLAGS_L4_CSUM_SHIFT 4 +#define CORE_TX_BD_FLAGS_IPV6_EXT_MASK 0x1 +#define CORE_TX_BD_FLAGS_IPV6_EXT_SHIFT 5 +#define CORE_TX_BD_FLAGS_L4_PROTOCOL_MASK 0x1 +#define CORE_TX_BD_FLAGS_L4_PROTOCOL_SHIFT 6 +#define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_MASK 0x1 +#define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_SHIFT 7 +}; + +struct core_tx_bd { + struct regpair addr; + __le16 nbytes; + __le16 nw_vlan_or_lb_echo; + u8 bitfield0; +#define CORE_TX_BD_NBDS_MASK 0xF +#define CORE_TX_BD_NBDS_SHIFT 0 +#define CORE_TX_BD_ROCE_FLAV_MASK 0x1 +#define CORE_TX_BD_ROCE_FLAV_SHIFT 4 +#define CORE_TX_BD_RESERVED0_MASK 0x7 +#define CORE_TX_BD_RESERVED0_SHIFT 5 + struct core_tx_bd_flags bd_flags; + __le16 bitfield1; +#define CORE_TX_BD_L4_HDR_OFFSET_W_MASK 0x3FFF +#define CORE_TX_BD_L4_HDR_OFFSET_W_SHIFT 0 +#define CORE_TX_BD_TX_DST_MASK 0x1 +#define CORE_TX_BD_TX_DST_SHIFT 14 +#define CORE_TX_BD_RESERVED1_MASK 0x1 +#define CORE_TX_BD_RESERVED1_SHIFT 15 +}; + +enum core_tx_dest { + CORE_TX_DEST_NW, + CORE_TX_DEST_LB, + MAX_CORE_TX_DEST +}; + +struct core_tx_start_ramrod_data { + struct regpair pbl_base_addr; + __le16 mtu; + __le16 sb_id; + u8 sb_index; + u8 stats_en; + u8 stats_id; + u8 conn_type; + __le16 pbl_size; + __le16 qm_pq_id; + u8 gsi_offload_flag; + u8 resrved[3]; +}; + +struct core_tx_stop_ramrod_data { + __le32 reserved0[2]; +}; + struct eth_mstorm_per_pf_stat { struct regpair gre_discard_pkts; struct regpair vxlan_discard_pkts; @@ -636,9 +874,33 @@ struct hsi_fp_ver_struct { }; /* Mstorm non-triggering VF zone */ +enum malicious_vf_error_id { + MALICIOUS_VF_NO_ERROR, + VF_PF_CHANNEL_NOT_READY, + VF_ZONE_MSG_NOT_VALID, + VF_ZONE_FUNC_NOT_ENABLED, + ETH_PACKET_TOO_SMALL, + ETH_ILLEGAL_VLAN_MODE, + ETH_MTU_VIOLATION, + ETH_ILLEGAL_INBAND_TAGS, + ETH_VLAN_INSERT_AND_INBAND_VLAN, + ETH_ILLEGAL_NBDS, + ETH_FIRST_BD_WO_SOP, + ETH_INSUFFICIENT_BDS, + ETH_ILLEGAL_LSO_HDR_NBDS, + ETH_ILLEGAL_LSO_MSS, + ETH_ZERO_SIZE_BD, + ETH_ILLEGAL_LSO_HDR_LEN, + ETH_INSUFFICIENT_PAYLOAD, + ETH_EDPM_OUT_OF_SYNC, + ETH_TUNN_IPV6_EXT_NBD_ERR, + ETH_CONTROL_PACKET_VIOLATION, + MAX_MALICIOUS_VF_ERROR_ID +}; + struct mstorm_non_trigger_vf_zone { struct eth_mstorm_per_queue_stat eth_queue_stat; - struct eth_rx_prod_data eth_rx_queue_producers[ETH_MAX_NUM_RX_QUEUES_PER_VF]; + struct eth_rx_prod_data eth_rx_queue_producers[ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD]; }; /* Mstorm VF zone */ @@ -705,13 +967,17 @@ struct pf_start_ramrod_data { struct protocol_dcb_data { u8 dcb_enable_flag; + u8 reserved_a; u8 dcb_priority; u8 dcb_tc; - u8 reserved; + u8 reserved_b; + u8 reserved0; }; struct pf_update_tunnel_config { u8 update_rx_pf_clss; + u8 update_rx_def_ucast_clss; + u8 update_rx_def_non_ucast_clss; u8 update_tx_pf_clss; u8 set_vxlan_udp_port_flg; u8 set_geneve_udp_port_flg; @@ -727,7 +993,7 @@ struct pf_update_tunnel_config { u8 tunnel_clss_ipgre; __le16 vxlan_udp_port; __le16 geneve_udp_port; - __le16 reserved[3]; + __le16 reserved[2]; }; struct pf_update_ramrod_data { @@ -736,16 +1002,17 @@ struct pf_update_ramrod_data { u8 update_fcoe_dcb_data_flag; u8 update_iscsi_dcb_data_flag; u8 update_roce_dcb_data_flag; + u8 update_rroce_dcb_data_flag; u8 update_iwarp_dcb_data_flag; u8 update_mf_vlan_flag; - u8 reserved; struct protocol_dcb_data eth_dcb_data; struct protocol_dcb_data fcoe_dcb_data; struct protocol_dcb_data iscsi_dcb_data; struct protocol_dcb_data roce_dcb_data; + struct protocol_dcb_data rroce_dcb_data; struct protocol_dcb_data iwarp_dcb_data; __le16 mf_vlan; - __le16 reserved2; + __le16 reserved; struct pf_update_tunnel_config tunnel_config; }; @@ -766,10 +1033,14 @@ enum protocol_version_array_key { MAX_PROTOCOL_VERSION_ARRAY_KEY }; -/* Pstorm non-triggering VF zone */ +struct rdma_sent_stats { + struct regpair sent_bytes; + struct regpair sent_pkts; +}; + struct pstorm_non_trigger_vf_zone { struct eth_pstorm_per_queue_stat eth_queue_stat; - struct regpair reserved[2]; + struct rdma_sent_stats rdma_stats; }; /* Pstorm VF zone */ @@ -786,7 +1057,11 @@ struct ramrod_header { __le16 echo; }; -/* Slowpath Element (SPQE) */ +struct rdma_rcv_stats { + struct regpair rcv_bytes; + struct regpair rcv_pkts; +}; + struct slow_path_element { struct ramrod_header hdr; struct regpair data_ptr; @@ -794,7 +1069,7 @@ struct slow_path_element { /* Tstorm non-triggering VF zone */ struct tstorm_non_trigger_vf_zone { - struct regpair reserved[2]; + struct rdma_rcv_stats rdma_stats; }; struct tstorm_per_port_stat { @@ -802,9 +1077,14 @@ struct tstorm_per_port_stat { struct regpair mac_error_discard; struct regpair mftag_filter_discard; struct regpair eth_mac_filter_discard; - struct regpair reserved[5]; + struct regpair ll2_mac_filter_discard; + struct regpair ll2_conn_disabled_discard; + struct regpair iscsi_irregular_pkt; + struct regpair reserved; + struct regpair roce_irregular_pkt; struct regpair eth_irregular_pkt; - struct regpair reserved1[2]; + struct regpair reserved1; + struct regpair preroce_irregular_pkt; struct regpair eth_gre_tunn_filter_discard; struct regpair eth_vxlan_tunn_filter_discard; struct regpair eth_geneve_tunn_filter_discard; @@ -870,7 +1150,13 @@ struct vf_stop_ramrod_data { __le32 reserved2; }; -/* Attentions status block */ +enum vf_zone_size_mode { + VF_ZONE_SIZE_MODE_DEFAULT, + VF_ZONE_SIZE_MODE_DOUBLE, + VF_ZONE_SIZE_MODE_QUAD, + MAX_VF_ZONE_SIZE_MODE +}; + struct atten_status_block { __le32 atten_bits; __le32 atten_ack; @@ -1579,6 +1865,7 @@ enum dbg_status { DBG_STATUS_REG_FIFO_BAD_DATA, DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA, DBG_STATUS_DBG_ARRAY_NOT_SET, + DBG_STATUS_MULTI_BLOCKS_WITH_FILTER, MAX_DBG_STATUS }; @@ -1589,7 +1876,41 @@ enum dbg_status { /* Number of VLAN priorities */ #define NUM_OF_VLAN_PRIORITIES 8 -/* QM per-port init parameters */ +struct init_brb_ram_req { + __le32 guranteed_per_tc; + __le32 headroom_per_tc; + __le32 min_pkt_size; + __le32 max_ports_per_engine; + u8 num_active_tcs[MAX_NUM_PORTS]; +}; + +struct init_ets_tc_req { + u8 use_sp; + u8 use_wfq; + __le16 weight; +}; + +struct init_ets_req { + __le32 mtu; + struct init_ets_tc_req tc_req[NUM_OF_TCS]; +}; + +struct init_nig_lb_rl_req { + __le16 lb_mac_rate; + __le16 lb_rate; + __le32 mtu; + __le16 tc_rate[NUM_OF_PHYS_TCS]; +}; + +struct init_nig_pri_tc_map_entry { + u8 tc_id; + u8 valid; +}; + +struct init_nig_pri_tc_map_req { + struct init_nig_pri_tc_map_entry pri[NUM_OF_VLAN_PRIORITIES]; +}; + struct init_qm_port_params { u8 active; u8 active_phys_tcs; @@ -1619,7 +1940,7 @@ struct init_qm_vport_params { /* Width of GRC address in bits (addresses are specified in dwords) */ #define GRC_ADDR_BITS 23 -#define MAX_GRC_ADDR ((1 << GRC_ADDR_BITS) - 1) +#define MAX_GRC_ADDR (BIT(GRC_ADDR_BITS) - 1) /* indicates an init that should be applied to any phase ID */ #define ANY_PHASE_ID 0xffff @@ -1674,11 +1995,11 @@ struct bin_buffer_hdr { /* binary init buffer types */ enum bin_init_buffer_type { - BIN_BUF_FW_VER_INFO, + BIN_BUF_INIT_FW_VER_INFO, BIN_BUF_INIT_CMD, BIN_BUF_INIT_VAL, BIN_BUF_INIT_MODE_TREE, - BIN_BUF_IRO, + BIN_BUF_INIT_IRO, MAX_BIN_INIT_BUFFER_TYPE }; @@ -1918,44 +2239,34 @@ enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn, #define MAX_NAME_LEN 16 /* Win 2 */ -#define GTT_BAR0_MAP_REG_IGU_CMD \ - 0x00f000UL +#define GTT_BAR0_MAP_REG_IGU_CMD 0x00f000UL /* Win 3 */ -#define GTT_BAR0_MAP_REG_TSDM_RAM \ - 0x010000UL +#define GTT_BAR0_MAP_REG_TSDM_RAM 0x010000UL /* Win 4 */ -#define GTT_BAR0_MAP_REG_MSDM_RAM \ - 0x011000UL +#define GTT_BAR0_MAP_REG_MSDM_RAM 0x011000UL /* Win 5 */ -#define GTT_BAR0_MAP_REG_MSDM_RAM_1024 \ - 0x012000UL +#define GTT_BAR0_MAP_REG_MSDM_RAM_1024 0x012000UL /* Win 6 */ -#define GTT_BAR0_MAP_REG_USDM_RAM \ - 0x013000UL +#define GTT_BAR0_MAP_REG_USDM_RAM 0x013000UL /* Win 7 */ -#define GTT_BAR0_MAP_REG_USDM_RAM_1024 \ - 0x014000UL +#define GTT_BAR0_MAP_REG_USDM_RAM_1024 0x014000UL /* Win 8 */ -#define GTT_BAR0_MAP_REG_USDM_RAM_2048 \ - 0x015000UL +#define GTT_BAR0_MAP_REG_USDM_RAM_2048 0x015000UL /* Win 9 */ -#define GTT_BAR0_MAP_REG_XSDM_RAM \ - 0x016000UL +#define GTT_BAR0_MAP_REG_XSDM_RAM 0x016000UL /* Win 10 */ -#define GTT_BAR0_MAP_REG_YSDM_RAM \ - 0x017000UL +#define GTT_BAR0_MAP_REG_YSDM_RAM 0x017000UL /* Win 11 */ -#define GTT_BAR0_MAP_REG_PSDM_RAM \ - 0x018000UL +#define GTT_BAR0_MAP_REG_PSDM_RAM 0x018000UL /** * @brief qed_qm_pf_mem_size - prepare QM ILT sizes @@ -2003,7 +2314,7 @@ struct qed_qm_pf_rt_init_params { u16 num_vf_pqs; u8 start_vport; u8 num_vports; - u8 pf_wfq; + u16 pf_wfq; u32 pf_rl; struct init_qm_pq_params *pq_params; struct init_qm_vport_params *vport_params; @@ -2138,6 +2449,9 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, #define TSTORM_PORT_STAT_OFFSET(port_id) \ (IRO[1].base + ((port_id) * IRO[1].m1)) #define TSTORM_PORT_STAT_SIZE (IRO[1].size) +#define TSTORM_LL2_PORT_STAT_OFFSET(port_id) \ + (IRO[2].base + ((port_id) * IRO[2].m1)) +#define TSTORM_LL2_PORT_STAT_SIZE (IRO[2].size) #define USTORM_VF_PF_CHANNEL_READY_OFFSET(vf_id) \ (IRO[3].base + ((vf_id) * IRO[3].m1)) #define USTORM_VF_PF_CHANNEL_READY_SIZE (IRO[3].size) @@ -2153,42 +2467,90 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, #define USTORM_COMMON_QUEUE_CONS_OFFSET(queue_zone_id) \ (IRO[7].base + ((queue_zone_id) * IRO[7].m1)) #define USTORM_COMMON_QUEUE_CONS_SIZE (IRO[7].size) +#define TSTORM_LL2_RX_PRODS_OFFSET(core_rx_queue_id) \ + (IRO[14].base + ((core_rx_queue_id) * IRO[14].m1)) +#define TSTORM_LL2_RX_PRODS_SIZE (IRO[14].size) +#define CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \ + (IRO[15].base + ((core_rx_queue_id) * IRO[15].m1)) +#define CORE_LL2_TSTORM_PER_QUEUE_STAT_SIZE (IRO[15].size) +#define CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \ + (IRO[16].base + ((core_rx_queue_id) * IRO[16].m1)) +#define CORE_LL2_USTORM_PER_QUEUE_STAT_SIZE (IRO[16].size) +#define CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(core_tx_stats_id) \ + (IRO[17].base + ((core_tx_stats_id) * IRO[17].m1)) +#define CORE_LL2_PSTORM_PER_QUEUE_STAT_SIZE (IRO[17]. size) #define MSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ (IRO[18].base + ((stat_counter_id) * IRO[18].m1)) #define MSTORM_QUEUE_STAT_SIZE (IRO[18].size) #define MSTORM_ETH_PF_PRODS_OFFSET(queue_id) \ (IRO[19].base + ((queue_id) * IRO[19].m1)) #define MSTORM_ETH_PF_PRODS_SIZE (IRO[19].size) -#define MSTORM_TPA_TIMEOUT_US_OFFSET (IRO[20].base) -#define MSTORM_TPA_TIMEOUT_US_SIZE (IRO[20].size) +#define MSTORM_ETH_VF_PRODS_OFFSET(vf_id, vf_queue_id) \ + (IRO[20].base + ((vf_id) * IRO[20].m1) + ((vf_queue_id) * IRO[20].m2)) +#define MSTORM_ETH_VF_PRODS_SIZE (IRO[20].size) +#define MSTORM_TPA_TIMEOUT_US_OFFSET (IRO[21].base) +#define MSTORM_TPA_TIMEOUT_US_SIZE (IRO[21].size) #define MSTORM_ETH_PF_STAT_OFFSET(pf_id) \ - (IRO[21].base + ((pf_id) * IRO[21].m1)) + (IRO[22].base + ((pf_id) * IRO[22].m1)) #define MSTORM_ETH_PF_STAT_SIZE (IRO[21].size) #define USTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ - (IRO[22].base + ((stat_counter_id) * IRO[22].m1)) -#define USTORM_QUEUE_STAT_SIZE (IRO[22].size) + (IRO[23].base + ((stat_counter_id) * IRO[23].m1)) +#define USTORM_QUEUE_STAT_SIZE (IRO[23].size) #define USTORM_ETH_PF_STAT_OFFSET(pf_id) \ - (IRO[23].base + ((pf_id) * IRO[23].m1)) -#define USTORM_ETH_PF_STAT_SIZE (IRO[23].size) + (IRO[24].base + ((pf_id) * IRO[24].m1)) +#define USTORM_ETH_PF_STAT_SIZE (IRO[24].size) #define PSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ - (IRO[24].base + ((stat_counter_id) * IRO[24].m1)) -#define PSTORM_QUEUE_STAT_SIZE (IRO[24].size) + (IRO[25].base + ((stat_counter_id) * IRO[25].m1)) +#define PSTORM_QUEUE_STAT_SIZE (IRO[25].size) #define PSTORM_ETH_PF_STAT_OFFSET(pf_id) \ - (IRO[25].base + ((pf_id) * IRO[25].m1)) -#define PSTORM_ETH_PF_STAT_SIZE (IRO[25].size) + (IRO[26].base + ((pf_id) * IRO[26].m1)) +#define PSTORM_ETH_PF_STAT_SIZE (IRO[26].size) #define PSTORM_CTL_FRAME_ETHTYPE_OFFSET(ethtype) \ - (IRO[26].base + ((ethtype) * IRO[26].m1)) -#define PSTORM_CTL_FRAME_ETHTYPE_SIZE (IRO[26].size) -#define TSTORM_ETH_PRS_INPUT_OFFSET (IRO[27].base) -#define TSTORM_ETH_PRS_INPUT_SIZE (IRO[27].size) + (IRO[27].base + ((ethtype) * IRO[27].m1)) +#define PSTORM_CTL_FRAME_ETHTYPE_SIZE (IRO[27].size) +#define TSTORM_ETH_PRS_INPUT_OFFSET (IRO[28].base) +#define TSTORM_ETH_PRS_INPUT_SIZE (IRO[28].size) #define ETH_RX_RATE_LIMIT_OFFSET(pf_id) \ - (IRO[28].base + ((pf_id) * IRO[28].m1)) -#define ETH_RX_RATE_LIMIT_SIZE (IRO[28].size) + (IRO[29].base + ((pf_id) * IRO[29].m1)) +#define ETH_RX_RATE_LIMIT_SIZE (IRO[29].size) #define XSTORM_ETH_QUEUE_ZONE_OFFSET(queue_id) \ - (IRO[29].base + ((queue_id) * IRO[29].m1)) -#define XSTORM_ETH_QUEUE_ZONE_SIZE (IRO[29].size) - -static const struct iro iro_arr[46] = { + (IRO[30].base + ((queue_id) * IRO[30].m1)) +#define XSTORM_ETH_QUEUE_ZONE_SIZE (IRO[30].size) +#define TSTORM_SCSI_CMDQ_CONS_OFFSET(cmdq_queue_id) \ + (IRO[34].base + ((cmdq_queue_id) * IRO[34].m1)) +#define TSTORM_SCSI_CMDQ_CONS_SIZE (IRO[34].size) +#define TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \ + (IRO[35].base + ((func_id) * IRO[35].m1) + ((bdq_id) * IRO[35].m2)) +#define TSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[35].size) +#define MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \ + (IRO[36].base + ((func_id) * IRO[36].m1) + ((bdq_id) * IRO[36].m2)) +#define MSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[36].size) +#define TSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[37].base + ((pf_id) * IRO[37].m1)) +#define TSTORM_ISCSI_RX_STATS_SIZE (IRO[37].size) +#define MSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[38].base + ((pf_id) * IRO[38].m1)) +#define MSTORM_ISCSI_RX_STATS_SIZE (IRO[38].size) +#define USTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[39].base + ((pf_id) * IRO[39].m1)) +#define USTORM_ISCSI_RX_STATS_SIZE (IRO[39].size) +#define XSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[40].base + ((pf_id) * IRO[40].m1)) +#define XSTORM_ISCSI_TX_STATS_SIZE (IRO[40].size) +#define YSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[41].base + ((pf_id) * IRO[41].m1)) +#define YSTORM_ISCSI_TX_STATS_SIZE (IRO[41].size) +#define PSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[42].base + ((pf_id) * IRO[42].m1)) +#define PSTORM_ISCSI_TX_STATS_SIZE (IRO[42].size) +#define PSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \ + (IRO[45].base + ((rdma_stat_counter_id) * IRO[45].m1)) +#define PSTORM_RDMA_QUEUE_STAT_SIZE (IRO[45].size) +#define TSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \ + (IRO[46].base + ((rdma_stat_counter_id) * IRO[46].m1)) +#define TSTORM_RDMA_QUEUE_STAT_SIZE (IRO[46].size) + +static const struct iro iro_arr[47] = { {0x0, 0x0, 0x0, 0x0, 0x8}, {0x4cb0, 0x78, 0x0, 0x0, 0x78}, {0x6318, 0x20, 0x0, 0x0, 0x20}, @@ -2201,20 +2563,21 @@ static const struct iro iro_arr[46] = { {0x3df0, 0x0, 0x0, 0x0, 0x78}, {0x29b0, 0x0, 0x0, 0x0, 0x78}, {0x4c38, 0x0, 0x0, 0x0, 0x78}, - {0x4a48, 0x0, 0x0, 0x0, 0x78}, + {0x4990, 0x0, 0x0, 0x0, 0x78}, {0x7e48, 0x0, 0x0, 0x0, 0x78}, {0xa28, 0x8, 0x0, 0x0, 0x8}, {0x60f8, 0x10, 0x0, 0x0, 0x10}, {0xb820, 0x30, 0x0, 0x0, 0x30}, {0x95b8, 0x30, 0x0, 0x0, 0x30}, - {0x4c18, 0x80, 0x0, 0x0, 0x40}, + {0x4b60, 0x80, 0x0, 0x0, 0x40}, {0x1f8, 0x4, 0x0, 0x0, 0x4}, - {0xc9a8, 0x0, 0x0, 0x0, 0x4}, - {0x4c58, 0x80, 0x0, 0x0, 0x20}, + {0x53a0, 0x80, 0x4, 0x0, 0x4}, + {0xc8f0, 0x0, 0x0, 0x0, 0x4}, + {0x4ba0, 0x80, 0x0, 0x0, 0x20}, {0x8050, 0x40, 0x0, 0x0, 0x30}, {0xe770, 0x60, 0x0, 0x0, 0x60}, {0x2b48, 0x80, 0x0, 0x0, 0x38}, - {0xdf88, 0x78, 0x0, 0x0, 0x78}, + {0xf188, 0x78, 0x0, 0x0, 0x78}, {0x1f8, 0x4, 0x0, 0x0, 0x4}, {0xacf0, 0x0, 0x0, 0x0, 0xf0}, {0xade0, 0x8, 0x0, 0x0, 0x8}, @@ -2226,455 +2589,457 @@ static const struct iro iro_arr[46] = { {0x200, 0x10, 0x8, 0x0, 0x8}, {0xb78, 0x10, 0x8, 0x0, 0x2}, {0xd888, 0x38, 0x0, 0x0, 0x24}, - {0x12120, 0x10, 0x0, 0x0, 0x8}, - {0x11b20, 0x38, 0x0, 0x0, 0x18}, + {0x12c38, 0x10, 0x0, 0x0, 0x8}, + {0x11aa0, 0x38, 0x0, 0x0, 0x18}, {0xa8c0, 0x30, 0x0, 0x0, 0x10}, {0x86f8, 0x28, 0x0, 0x0, 0x18}, - {0xeff8, 0x10, 0x0, 0x0, 0x10}, + {0x101f8, 0x10, 0x0, 0x0, 0x10}, {0xdd08, 0x48, 0x0, 0x0, 0x38}, - {0xf460, 0x20, 0x0, 0x0, 0x20}, + {0x10660, 0x20, 0x0, 0x0, 0x20}, {0x2b80, 0x80, 0x0, 0x0, 0x10}, {0x5000, 0x10, 0x0, 0x0, 0x10}, }; /* Runtime array offsets */ -#define DORQ_REG_PF_MAX_ICID_0_RT_OFFSET 0 -#define DORQ_REG_PF_MAX_ICID_1_RT_OFFSET 1 -#define DORQ_REG_PF_MAX_ICID_2_RT_OFFSET 2 -#define DORQ_REG_PF_MAX_ICID_3_RT_OFFSET 3 -#define DORQ_REG_PF_MAX_ICID_4_RT_OFFSET 4 -#define DORQ_REG_PF_MAX_ICID_5_RT_OFFSET 5 -#define DORQ_REG_PF_MAX_ICID_6_RT_OFFSET 6 -#define DORQ_REG_PF_MAX_ICID_7_RT_OFFSET 7 -#define DORQ_REG_VF_MAX_ICID_0_RT_OFFSET 8 -#define DORQ_REG_VF_MAX_ICID_1_RT_OFFSET 9 -#define DORQ_REG_VF_MAX_ICID_2_RT_OFFSET 10 -#define DORQ_REG_VF_MAX_ICID_3_RT_OFFSET 11 -#define DORQ_REG_VF_MAX_ICID_4_RT_OFFSET 12 -#define DORQ_REG_VF_MAX_ICID_5_RT_OFFSET 13 -#define DORQ_REG_VF_MAX_ICID_6_RT_OFFSET 14 -#define DORQ_REG_VF_MAX_ICID_7_RT_OFFSET 15 -#define DORQ_REG_PF_WAKE_ALL_RT_OFFSET 16 -#define DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET 17 -#define IGU_REG_PF_CONFIGURATION_RT_OFFSET 18 -#define IGU_REG_VF_CONFIGURATION_RT_OFFSET 19 -#define IGU_REG_ATTN_MSG_ADDR_L_RT_OFFSET 20 -#define IGU_REG_ATTN_MSG_ADDR_H_RT_OFFSET 21 -#define IGU_REG_LEADING_EDGE_LATCH_RT_OFFSET 22 -#define IGU_REG_TRAILING_EDGE_LATCH_RT_OFFSET 23 -#define CAU_REG_CQE_AGG_UNIT_SIZE_RT_OFFSET 24 -#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET 761 -#define CAU_REG_SB_VAR_MEMORY_RT_SIZE 736 -#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET 761 -#define CAU_REG_SB_VAR_MEMORY_RT_SIZE 736 -#define CAU_REG_SB_ADDR_MEMORY_RT_OFFSET 1497 -#define CAU_REG_SB_ADDR_MEMORY_RT_SIZE 736 -#define CAU_REG_PI_MEMORY_RT_OFFSET 2233 -#define CAU_REG_PI_MEMORY_RT_SIZE 4416 -#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET 6649 -#define PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET 6650 -#define PRS_REG_TASK_ID_MAX_INITIATOR_VF_RT_OFFSET 6651 -#define PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET 6652 -#define PRS_REG_TASK_ID_MAX_TARGET_VF_RT_OFFSET 6653 -#define PRS_REG_SEARCH_TCP_RT_OFFSET 6654 -#define PRS_REG_SEARCH_FCOE_RT_OFFSET 6655 -#define PRS_REG_SEARCH_ROCE_RT_OFFSET 6656 -#define PRS_REG_ROCE_DEST_QP_MAX_VF_RT_OFFSET 6657 -#define PRS_REG_ROCE_DEST_QP_MAX_PF_RT_OFFSET 6658 -#define PRS_REG_SEARCH_OPENFLOW_RT_OFFSET 6659 -#define PRS_REG_SEARCH_NON_IP_AS_OPENFLOW_RT_OFFSET 6660 -#define PRS_REG_OPENFLOW_SUPPORT_ONLY_KNOWN_OVER_IP_RT_OFFSET 6661 -#define PRS_REG_OPENFLOW_SEARCH_KEY_MASK_RT_OFFSET 6662 -#define PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET 6663 -#define PRS_REG_LIGHT_L2_ETHERTYPE_EN_RT_OFFSET 6664 -#define SRC_REG_FIRSTFREE_RT_OFFSET 6665 -#define SRC_REG_FIRSTFREE_RT_SIZE 2 -#define SRC_REG_LASTFREE_RT_OFFSET 6667 -#define SRC_REG_LASTFREE_RT_SIZE 2 -#define SRC_REG_COUNTFREE_RT_OFFSET 6669 -#define SRC_REG_NUMBER_HASH_BITS_RT_OFFSET 6670 -#define PSWRQ2_REG_CDUT_P_SIZE_RT_OFFSET 6671 -#define PSWRQ2_REG_CDUC_P_SIZE_RT_OFFSET 6672 -#define PSWRQ2_REG_TM_P_SIZE_RT_OFFSET 6673 -#define PSWRQ2_REG_QM_P_SIZE_RT_OFFSET 6674 -#define PSWRQ2_REG_SRC_P_SIZE_RT_OFFSET 6675 -#define PSWRQ2_REG_TSDM_P_SIZE_RT_OFFSET 6676 -#define PSWRQ2_REG_TM_FIRST_ILT_RT_OFFSET 6677 -#define PSWRQ2_REG_TM_LAST_ILT_RT_OFFSET 6678 -#define PSWRQ2_REG_QM_FIRST_ILT_RT_OFFSET 6679 -#define PSWRQ2_REG_QM_LAST_ILT_RT_OFFSET 6680 -#define PSWRQ2_REG_SRC_FIRST_ILT_RT_OFFSET 6681 -#define PSWRQ2_REG_SRC_LAST_ILT_RT_OFFSET 6682 -#define PSWRQ2_REG_CDUC_FIRST_ILT_RT_OFFSET 6683 -#define PSWRQ2_REG_CDUC_LAST_ILT_RT_OFFSET 6684 -#define PSWRQ2_REG_CDUT_FIRST_ILT_RT_OFFSET 6685 -#define PSWRQ2_REG_CDUT_LAST_ILT_RT_OFFSET 6686 -#define PSWRQ2_REG_TSDM_FIRST_ILT_RT_OFFSET 6687 -#define PSWRQ2_REG_TSDM_LAST_ILT_RT_OFFSET 6688 -#define PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6689 -#define PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6690 -#define PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6691 -#define PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET 6692 -#define PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET 6693 -#define PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET 6694 -#define PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET 6695 -#define PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET 6696 -#define PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET 6697 -#define PSWRQ2_REG_VF_BASE_RT_OFFSET 6698 -#define PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET 6699 -#define PSWRQ2_REG_WR_MBS0_RT_OFFSET 6700 -#define PSWRQ2_REG_RD_MBS0_RT_OFFSET 6701 -#define PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET 6702 -#define PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET 6703 -#define PSWRQ2_REG_ILT_MEMORY_RT_OFFSET 6704 -#define PSWRQ2_REG_ILT_MEMORY_RT_SIZE 22000 -#define PGLUE_REG_B_VF_BASE_RT_OFFSET 28704 -#define PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET 28705 -#define PGLUE_REG_B_PF_BAR0_SIZE_RT_OFFSET 28706 -#define PGLUE_REG_B_PF_BAR1_SIZE_RT_OFFSET 28707 -#define PGLUE_REG_B_VF_BAR1_SIZE_RT_OFFSET 28708 -#define TM_REG_VF_ENABLE_CONN_RT_OFFSET 28709 -#define TM_REG_PF_ENABLE_CONN_RT_OFFSET 28710 -#define TM_REG_PF_ENABLE_TASK_RT_OFFSET 28711 -#define TM_REG_GROUP_SIZE_RESOLUTION_CONN_RT_OFFSET 28712 -#define TM_REG_GROUP_SIZE_RESOLUTION_TASK_RT_OFFSET 28713 -#define TM_REG_CONFIG_CONN_MEM_RT_OFFSET 28714 -#define TM_REG_CONFIG_CONN_MEM_RT_SIZE 416 -#define TM_REG_CONFIG_TASK_MEM_RT_OFFSET 29130 -#define TM_REG_CONFIG_TASK_MEM_RT_SIZE 512 -#define QM_REG_MAXPQSIZE_0_RT_OFFSET 29642 -#define QM_REG_MAXPQSIZE_1_RT_OFFSET 29643 -#define QM_REG_MAXPQSIZE_2_RT_OFFSET 29644 -#define QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET 29645 -#define QM_REG_MAXPQSIZETXSEL_1_RT_OFFSET 29646 -#define QM_REG_MAXPQSIZETXSEL_2_RT_OFFSET 29647 -#define QM_REG_MAXPQSIZETXSEL_3_RT_OFFSET 29648 -#define QM_REG_MAXPQSIZETXSEL_4_RT_OFFSET 29649 -#define QM_REG_MAXPQSIZETXSEL_5_RT_OFFSET 29650 -#define QM_REG_MAXPQSIZETXSEL_6_RT_OFFSET 29651 -#define QM_REG_MAXPQSIZETXSEL_7_RT_OFFSET 29652 -#define QM_REG_MAXPQSIZETXSEL_8_RT_OFFSET 29653 -#define QM_REG_MAXPQSIZETXSEL_9_RT_OFFSET 29654 -#define QM_REG_MAXPQSIZETXSEL_10_RT_OFFSET 29655 -#define QM_REG_MAXPQSIZETXSEL_11_RT_OFFSET 29656 -#define QM_REG_MAXPQSIZETXSEL_12_RT_OFFSET 29657 -#define QM_REG_MAXPQSIZETXSEL_13_RT_OFFSET 29658 -#define QM_REG_MAXPQSIZETXSEL_14_RT_OFFSET 29659 -#define QM_REG_MAXPQSIZETXSEL_15_RT_OFFSET 29660 -#define QM_REG_MAXPQSIZETXSEL_16_RT_OFFSET 29661 -#define QM_REG_MAXPQSIZETXSEL_17_RT_OFFSET 29662 -#define QM_REG_MAXPQSIZETXSEL_18_RT_OFFSET 29663 -#define QM_REG_MAXPQSIZETXSEL_19_RT_OFFSET 29664 -#define QM_REG_MAXPQSIZETXSEL_20_RT_OFFSET 29665 -#define QM_REG_MAXPQSIZETXSEL_21_RT_OFFSET 29666 -#define QM_REG_MAXPQSIZETXSEL_22_RT_OFFSET 29667 -#define QM_REG_MAXPQSIZETXSEL_23_RT_OFFSET 29668 -#define QM_REG_MAXPQSIZETXSEL_24_RT_OFFSET 29669 -#define QM_REG_MAXPQSIZETXSEL_25_RT_OFFSET 29670 -#define QM_REG_MAXPQSIZETXSEL_26_RT_OFFSET 29671 -#define QM_REG_MAXPQSIZETXSEL_27_RT_OFFSET 29672 -#define QM_REG_MAXPQSIZETXSEL_28_RT_OFFSET 29673 -#define QM_REG_MAXPQSIZETXSEL_29_RT_OFFSET 29674 -#define QM_REG_MAXPQSIZETXSEL_30_RT_OFFSET 29675 -#define QM_REG_MAXPQSIZETXSEL_31_RT_OFFSET 29676 -#define QM_REG_MAXPQSIZETXSEL_32_RT_OFFSET 29677 -#define QM_REG_MAXPQSIZETXSEL_33_RT_OFFSET 29678 -#define QM_REG_MAXPQSIZETXSEL_34_RT_OFFSET 29679 -#define QM_REG_MAXPQSIZETXSEL_35_RT_OFFSET 29680 -#define QM_REG_MAXPQSIZETXSEL_36_RT_OFFSET 29681 -#define QM_REG_MAXPQSIZETXSEL_37_RT_OFFSET 29682 -#define QM_REG_MAXPQSIZETXSEL_38_RT_OFFSET 29683 -#define QM_REG_MAXPQSIZETXSEL_39_RT_OFFSET 29684 -#define QM_REG_MAXPQSIZETXSEL_40_RT_OFFSET 29685 -#define QM_REG_MAXPQSIZETXSEL_41_RT_OFFSET 29686 -#define QM_REG_MAXPQSIZETXSEL_42_RT_OFFSET 29687 -#define QM_REG_MAXPQSIZETXSEL_43_RT_OFFSET 29688 -#define QM_REG_MAXPQSIZETXSEL_44_RT_OFFSET 29689 -#define QM_REG_MAXPQSIZETXSEL_45_RT_OFFSET 29690 -#define QM_REG_MAXPQSIZETXSEL_46_RT_OFFSET 29691 -#define QM_REG_MAXPQSIZETXSEL_47_RT_OFFSET 29692 -#define QM_REG_MAXPQSIZETXSEL_48_RT_OFFSET 29693 -#define QM_REG_MAXPQSIZETXSEL_49_RT_OFFSET 29694 -#define QM_REG_MAXPQSIZETXSEL_50_RT_OFFSET 29695 -#define QM_REG_MAXPQSIZETXSEL_51_RT_OFFSET 29696 -#define QM_REG_MAXPQSIZETXSEL_52_RT_OFFSET 29697 -#define QM_REG_MAXPQSIZETXSEL_53_RT_OFFSET 29698 -#define QM_REG_MAXPQSIZETXSEL_54_RT_OFFSET 29699 -#define QM_REG_MAXPQSIZETXSEL_55_RT_OFFSET 29700 -#define QM_REG_MAXPQSIZETXSEL_56_RT_OFFSET 29701 -#define QM_REG_MAXPQSIZETXSEL_57_RT_OFFSET 29702 -#define QM_REG_MAXPQSIZETXSEL_58_RT_OFFSET 29703 -#define QM_REG_MAXPQSIZETXSEL_59_RT_OFFSET 29704 -#define QM_REG_MAXPQSIZETXSEL_60_RT_OFFSET 29705 -#define QM_REG_MAXPQSIZETXSEL_61_RT_OFFSET 29706 -#define QM_REG_MAXPQSIZETXSEL_62_RT_OFFSET 29707 -#define QM_REG_MAXPQSIZETXSEL_63_RT_OFFSET 29708 -#define QM_REG_BASEADDROTHERPQ_RT_OFFSET 29709 -#define QM_REG_BASEADDROTHERPQ_RT_SIZE 128 -#define QM_REG_VOQCRDLINE_RT_OFFSET 29837 -#define QM_REG_VOQCRDLINE_RT_SIZE 20 -#define QM_REG_VOQINITCRDLINE_RT_OFFSET 29857 -#define QM_REG_VOQINITCRDLINE_RT_SIZE 20 -#define QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET 29877 -#define QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET 29878 -#define QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET 29879 -#define QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET 29880 -#define QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET 29881 -#define QM_REG_WRROTHERPQGRP_0_RT_OFFSET 29882 -#define QM_REG_WRROTHERPQGRP_1_RT_OFFSET 29883 -#define QM_REG_WRROTHERPQGRP_2_RT_OFFSET 29884 -#define QM_REG_WRROTHERPQGRP_3_RT_OFFSET 29885 -#define QM_REG_WRROTHERPQGRP_4_RT_OFFSET 29886 -#define QM_REG_WRROTHERPQGRP_5_RT_OFFSET 29887 -#define QM_REG_WRROTHERPQGRP_6_RT_OFFSET 29888 -#define QM_REG_WRROTHERPQGRP_7_RT_OFFSET 29889 -#define QM_REG_WRROTHERPQGRP_8_RT_OFFSET 29890 -#define QM_REG_WRROTHERPQGRP_9_RT_OFFSET 29891 -#define QM_REG_WRROTHERPQGRP_10_RT_OFFSET 29892 -#define QM_REG_WRROTHERPQGRP_11_RT_OFFSET 29893 -#define QM_REG_WRROTHERPQGRP_12_RT_OFFSET 29894 -#define QM_REG_WRROTHERPQGRP_13_RT_OFFSET 29895 -#define QM_REG_WRROTHERPQGRP_14_RT_OFFSET 29896 -#define QM_REG_WRROTHERPQGRP_15_RT_OFFSET 29897 -#define QM_REG_WRROTHERGRPWEIGHT_0_RT_OFFSET 29898 -#define QM_REG_WRROTHERGRPWEIGHT_1_RT_OFFSET 29899 -#define QM_REG_WRROTHERGRPWEIGHT_2_RT_OFFSET 29900 -#define QM_REG_WRROTHERGRPWEIGHT_3_RT_OFFSET 29901 -#define QM_REG_WRRTXGRPWEIGHT_0_RT_OFFSET 29902 -#define QM_REG_WRRTXGRPWEIGHT_1_RT_OFFSET 29903 -#define QM_REG_PQTX2PF_0_RT_OFFSET 29904 -#define QM_REG_PQTX2PF_1_RT_OFFSET 29905 -#define QM_REG_PQTX2PF_2_RT_OFFSET 29906 -#define QM_REG_PQTX2PF_3_RT_OFFSET 29907 -#define QM_REG_PQTX2PF_4_RT_OFFSET 29908 -#define QM_REG_PQTX2PF_5_RT_OFFSET 29909 -#define QM_REG_PQTX2PF_6_RT_OFFSET 29910 -#define QM_REG_PQTX2PF_7_RT_OFFSET 29911 -#define QM_REG_PQTX2PF_8_RT_OFFSET 29912 -#define QM_REG_PQTX2PF_9_RT_OFFSET 29913 -#define QM_REG_PQTX2PF_10_RT_OFFSET 29914 -#define QM_REG_PQTX2PF_11_RT_OFFSET 29915 -#define QM_REG_PQTX2PF_12_RT_OFFSET 29916 -#define QM_REG_PQTX2PF_13_RT_OFFSET 29917 -#define QM_REG_PQTX2PF_14_RT_OFFSET 29918 -#define QM_REG_PQTX2PF_15_RT_OFFSET 29919 -#define QM_REG_PQTX2PF_16_RT_OFFSET 29920 -#define QM_REG_PQTX2PF_17_RT_OFFSET 29921 -#define QM_REG_PQTX2PF_18_RT_OFFSET 29922 -#define QM_REG_PQTX2PF_19_RT_OFFSET 29923 -#define QM_REG_PQTX2PF_20_RT_OFFSET 29924 -#define QM_REG_PQTX2PF_21_RT_OFFSET 29925 -#define QM_REG_PQTX2PF_22_RT_OFFSET 29926 -#define QM_REG_PQTX2PF_23_RT_OFFSET 29927 -#define QM_REG_PQTX2PF_24_RT_OFFSET 29928 -#define QM_REG_PQTX2PF_25_RT_OFFSET 29929 -#define QM_REG_PQTX2PF_26_RT_OFFSET 29930 -#define QM_REG_PQTX2PF_27_RT_OFFSET 29931 -#define QM_REG_PQTX2PF_28_RT_OFFSET 29932 -#define QM_REG_PQTX2PF_29_RT_OFFSET 29933 -#define QM_REG_PQTX2PF_30_RT_OFFSET 29934 -#define QM_REG_PQTX2PF_31_RT_OFFSET 29935 -#define QM_REG_PQTX2PF_32_RT_OFFSET 29936 -#define QM_REG_PQTX2PF_33_RT_OFFSET 29937 -#define QM_REG_PQTX2PF_34_RT_OFFSET 29938 -#define QM_REG_PQTX2PF_35_RT_OFFSET 29939 -#define QM_REG_PQTX2PF_36_RT_OFFSET 29940 -#define QM_REG_PQTX2PF_37_RT_OFFSET 29941 -#define QM_REG_PQTX2PF_38_RT_OFFSET 29942 -#define QM_REG_PQTX2PF_39_RT_OFFSET 29943 -#define QM_REG_PQTX2PF_40_RT_OFFSET 29944 -#define QM_REG_PQTX2PF_41_RT_OFFSET 29945 -#define QM_REG_PQTX2PF_42_RT_OFFSET 29946 -#define QM_REG_PQTX2PF_43_RT_OFFSET 29947 -#define QM_REG_PQTX2PF_44_RT_OFFSET 29948 -#define QM_REG_PQTX2PF_45_RT_OFFSET 29949 -#define QM_REG_PQTX2PF_46_RT_OFFSET 29950 -#define QM_REG_PQTX2PF_47_RT_OFFSET 29951 -#define QM_REG_PQTX2PF_48_RT_OFFSET 29952 -#define QM_REG_PQTX2PF_49_RT_OFFSET 29953 -#define QM_REG_PQTX2PF_50_RT_OFFSET 29954 -#define QM_REG_PQTX2PF_51_RT_OFFSET 29955 -#define QM_REG_PQTX2PF_52_RT_OFFSET 29956 -#define QM_REG_PQTX2PF_53_RT_OFFSET 29957 -#define QM_REG_PQTX2PF_54_RT_OFFSET 29958 -#define QM_REG_PQTX2PF_55_RT_OFFSET 29959 -#define QM_REG_PQTX2PF_56_RT_OFFSET 29960 -#define QM_REG_PQTX2PF_57_RT_OFFSET 29961 -#define QM_REG_PQTX2PF_58_RT_OFFSET 29962 -#define QM_REG_PQTX2PF_59_RT_OFFSET 29963 -#define QM_REG_PQTX2PF_60_RT_OFFSET 29964 -#define QM_REG_PQTX2PF_61_RT_OFFSET 29965 -#define QM_REG_PQTX2PF_62_RT_OFFSET 29966 -#define QM_REG_PQTX2PF_63_RT_OFFSET 29967 -#define QM_REG_PQOTHER2PF_0_RT_OFFSET 29968 -#define QM_REG_PQOTHER2PF_1_RT_OFFSET 29969 -#define QM_REG_PQOTHER2PF_2_RT_OFFSET 29970 -#define QM_REG_PQOTHER2PF_3_RT_OFFSET 29971 -#define QM_REG_PQOTHER2PF_4_RT_OFFSET 29972 -#define QM_REG_PQOTHER2PF_5_RT_OFFSET 29973 -#define QM_REG_PQOTHER2PF_6_RT_OFFSET 29974 -#define QM_REG_PQOTHER2PF_7_RT_OFFSET 29975 -#define QM_REG_PQOTHER2PF_8_RT_OFFSET 29976 -#define QM_REG_PQOTHER2PF_9_RT_OFFSET 29977 -#define QM_REG_PQOTHER2PF_10_RT_OFFSET 29978 -#define QM_REG_PQOTHER2PF_11_RT_OFFSET 29979 -#define QM_REG_PQOTHER2PF_12_RT_OFFSET 29980 -#define QM_REG_PQOTHER2PF_13_RT_OFFSET 29981 -#define QM_REG_PQOTHER2PF_14_RT_OFFSET 29982 -#define QM_REG_PQOTHER2PF_15_RT_OFFSET 29983 -#define QM_REG_RLGLBLPERIOD_0_RT_OFFSET 29984 -#define QM_REG_RLGLBLPERIOD_1_RT_OFFSET 29985 -#define QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET 29986 -#define QM_REG_RLGLBLPERIODTIMER_1_RT_OFFSET 29987 -#define QM_REG_RLGLBLPERIODSEL_0_RT_OFFSET 29988 -#define QM_REG_RLGLBLPERIODSEL_1_RT_OFFSET 29989 -#define QM_REG_RLGLBLPERIODSEL_2_RT_OFFSET 29990 -#define QM_REG_RLGLBLPERIODSEL_3_RT_OFFSET 29991 -#define QM_REG_RLGLBLPERIODSEL_4_RT_OFFSET 29992 -#define QM_REG_RLGLBLPERIODSEL_5_RT_OFFSET 29993 -#define QM_REG_RLGLBLPERIODSEL_6_RT_OFFSET 29994 -#define QM_REG_RLGLBLPERIODSEL_7_RT_OFFSET 29995 -#define QM_REG_RLGLBLINCVAL_RT_OFFSET 29996 -#define QM_REG_RLGLBLINCVAL_RT_SIZE 256 -#define QM_REG_RLGLBLUPPERBOUND_RT_OFFSET 30252 -#define QM_REG_RLGLBLUPPERBOUND_RT_SIZE 256 -#define QM_REG_RLGLBLCRD_RT_OFFSET 30508 -#define QM_REG_RLGLBLCRD_RT_SIZE 256 -#define QM_REG_RLGLBLENABLE_RT_OFFSET 30764 -#define QM_REG_RLPFPERIOD_RT_OFFSET 30765 -#define QM_REG_RLPFPERIODTIMER_RT_OFFSET 30766 -#define QM_REG_RLPFINCVAL_RT_OFFSET 30767 -#define QM_REG_RLPFINCVAL_RT_SIZE 16 -#define QM_REG_RLPFUPPERBOUND_RT_OFFSET 30783 -#define QM_REG_RLPFUPPERBOUND_RT_SIZE 16 -#define QM_REG_RLPFCRD_RT_OFFSET 30799 -#define QM_REG_RLPFCRD_RT_SIZE 16 -#define QM_REG_RLPFENABLE_RT_OFFSET 30815 -#define QM_REG_RLPFVOQENABLE_RT_OFFSET 30816 -#define QM_REG_WFQPFWEIGHT_RT_OFFSET 30817 -#define QM_REG_WFQPFWEIGHT_RT_SIZE 16 -#define QM_REG_WFQPFUPPERBOUND_RT_OFFSET 30833 -#define QM_REG_WFQPFUPPERBOUND_RT_SIZE 16 -#define QM_REG_WFQPFCRD_RT_OFFSET 30849 -#define QM_REG_WFQPFCRD_RT_SIZE 160 -#define QM_REG_WFQPFENABLE_RT_OFFSET 31009 -#define QM_REG_WFQVPENABLE_RT_OFFSET 31010 -#define QM_REG_BASEADDRTXPQ_RT_OFFSET 31011 -#define QM_REG_BASEADDRTXPQ_RT_SIZE 512 -#define QM_REG_TXPQMAP_RT_OFFSET 31523 -#define QM_REG_TXPQMAP_RT_SIZE 512 -#define QM_REG_WFQVPWEIGHT_RT_OFFSET 32035 -#define QM_REG_WFQVPWEIGHT_RT_SIZE 512 -#define QM_REG_WFQVPCRD_RT_OFFSET 32547 -#define QM_REG_WFQVPCRD_RT_SIZE 512 -#define QM_REG_WFQVPMAP_RT_OFFSET 33059 -#define QM_REG_WFQVPMAP_RT_SIZE 512 -#define QM_REG_WFQPFCRD_MSB_RT_OFFSET 33571 -#define QM_REG_WFQPFCRD_MSB_RT_SIZE 160 -#define NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET 33731 -#define NIG_REG_OUTER_TAG_VALUE_LIST0_RT_OFFSET 33732 -#define NIG_REG_OUTER_TAG_VALUE_LIST1_RT_OFFSET 33733 -#define NIG_REG_OUTER_TAG_VALUE_LIST2_RT_OFFSET 33734 -#define NIG_REG_OUTER_TAG_VALUE_LIST3_RT_OFFSET 33735 -#define NIG_REG_OUTER_TAG_VALUE_MASK_RT_OFFSET 33736 -#define NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET 33737 -#define NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET 33738 -#define NIG_REG_LLH_FUNC_TAG_EN_RT_SIZE 4 -#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_OFFSET 33742 -#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_SIZE 4 -#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET 33746 -#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_SIZE 4 -#define NIG_REG_LLH_FUNC_NO_TAG_RT_OFFSET 33750 -#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_OFFSET 33751 -#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_SIZE 32 -#define NIG_REG_LLH_FUNC_FILTER_EN_RT_OFFSET 33783 -#define NIG_REG_LLH_FUNC_FILTER_EN_RT_SIZE 16 -#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_OFFSET 33799 -#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_SIZE 16 -#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET 33815 -#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE 16 -#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET 33831 -#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_SIZE 16 -#define NIG_REG_TX_EDPM_CTRL_RT_OFFSET 33847 -#define NIG_REG_ROCE_DUPLICATE_TO_HOST_RT_OFFSET 33848 -#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET 33849 -#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET 33850 -#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET 33851 -#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET 33852 -#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET 33853 -#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET 33854 -#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET 33855 -#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET 33856 -#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET 33857 -#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET 33858 -#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET 33859 -#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET 33860 -#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET 33861 -#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET 33862 -#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET 33863 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET 33864 -#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET 33865 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET 33866 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET 33867 -#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET 33868 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET 33869 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET 33870 -#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET 33871 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET 33872 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET 33873 -#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET 33874 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET 33875 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET 33876 -#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET 33877 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET 33878 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET 33879 -#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET 33880 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET 33881 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET 33882 -#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET 33883 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET 33884 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET 33885 -#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET 33886 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET 33887 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET 33888 -#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET 33889 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET 33890 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET 33891 -#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET 33892 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET 33893 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET 33894 -#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET 33895 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET 33896 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET 33897 -#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET 33898 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET 33899 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET 33900 -#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET 33901 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET 33902 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET 33903 -#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET 33904 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET 33905 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET 33906 -#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET 33907 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET 33908 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET 33909 -#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET 33910 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET 33911 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET 33912 -#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET 33913 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET 33914 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET 33915 -#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET 33916 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET 33917 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET 33918 -#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET 33919 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET 33920 -#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET 33921 -#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET 33922 -#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET 33923 -#define XCM_REG_CON_PHY_Q3_RT_OFFSET 33924 - -#define RUNTIME_ARRAY_SIZE 33925 +#define DORQ_REG_PF_MAX_ICID_0_RT_OFFSET 0 +#define DORQ_REG_PF_MAX_ICID_1_RT_OFFSET 1 +#define DORQ_REG_PF_MAX_ICID_2_RT_OFFSET 2 +#define DORQ_REG_PF_MAX_ICID_3_RT_OFFSET 3 +#define DORQ_REG_PF_MAX_ICID_4_RT_OFFSET 4 +#define DORQ_REG_PF_MAX_ICID_5_RT_OFFSET 5 +#define DORQ_REG_PF_MAX_ICID_6_RT_OFFSET 6 +#define DORQ_REG_PF_MAX_ICID_7_RT_OFFSET 7 +#define DORQ_REG_VF_MAX_ICID_0_RT_OFFSET 8 +#define DORQ_REG_VF_MAX_ICID_1_RT_OFFSET 9 +#define DORQ_REG_VF_MAX_ICID_2_RT_OFFSET 10 +#define DORQ_REG_VF_MAX_ICID_3_RT_OFFSET 11 +#define DORQ_REG_VF_MAX_ICID_4_RT_OFFSET 12 +#define DORQ_REG_VF_MAX_ICID_5_RT_OFFSET 13 +#define DORQ_REG_VF_MAX_ICID_6_RT_OFFSET 14 +#define DORQ_REG_VF_MAX_ICID_7_RT_OFFSET 15 +#define DORQ_REG_PF_WAKE_ALL_RT_OFFSET 16 +#define DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET 17 +#define IGU_REG_PF_CONFIGURATION_RT_OFFSET 18 +#define IGU_REG_VF_CONFIGURATION_RT_OFFSET 19 +#define IGU_REG_ATTN_MSG_ADDR_L_RT_OFFSET 20 +#define IGU_REG_ATTN_MSG_ADDR_H_RT_OFFSET 21 +#define IGU_REG_LEADING_EDGE_LATCH_RT_OFFSET 22 +#define IGU_REG_TRAILING_EDGE_LATCH_RT_OFFSET 23 +#define CAU_REG_CQE_AGG_UNIT_SIZE_RT_OFFSET 24 +#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET 761 +#define CAU_REG_SB_VAR_MEMORY_RT_SIZE 736 +#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET 761 +#define CAU_REG_SB_VAR_MEMORY_RT_SIZE 736 +#define CAU_REG_SB_ADDR_MEMORY_RT_OFFSET 1497 +#define CAU_REG_SB_ADDR_MEMORY_RT_SIZE 736 +#define CAU_REG_PI_MEMORY_RT_OFFSET 2233 +#define CAU_REG_PI_MEMORY_RT_SIZE 4416 +#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET 6649 +#define PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET 6650 +#define PRS_REG_TASK_ID_MAX_INITIATOR_VF_RT_OFFSET 6651 +#define PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET 6652 +#define PRS_REG_TASK_ID_MAX_TARGET_VF_RT_OFFSET 6653 +#define PRS_REG_SEARCH_TCP_RT_OFFSET 6654 +#define PRS_REG_SEARCH_FCOE_RT_OFFSET 6655 +#define PRS_REG_SEARCH_ROCE_RT_OFFSET 6656 +#define PRS_REG_ROCE_DEST_QP_MAX_VF_RT_OFFSET 6657 +#define PRS_REG_ROCE_DEST_QP_MAX_PF_RT_OFFSET 6658 +#define PRS_REG_SEARCH_OPENFLOW_RT_OFFSET 6659 +#define PRS_REG_SEARCH_NON_IP_AS_OPENFLOW_RT_OFFSET 6660 +#define PRS_REG_OPENFLOW_SUPPORT_ONLY_KNOWN_OVER_IP_RT_OFFSET 6661 +#define PRS_REG_OPENFLOW_SEARCH_KEY_MASK_RT_OFFSET 6662 +#define PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET 6663 +#define PRS_REG_LIGHT_L2_ETHERTYPE_EN_RT_OFFSET 6664 +#define SRC_REG_FIRSTFREE_RT_OFFSET 6665 +#define SRC_REG_FIRSTFREE_RT_SIZE 2 +#define SRC_REG_LASTFREE_RT_OFFSET 6667 +#define SRC_REG_LASTFREE_RT_SIZE 2 +#define SRC_REG_COUNTFREE_RT_OFFSET 6669 +#define SRC_REG_NUMBER_HASH_BITS_RT_OFFSET 6670 +#define PSWRQ2_REG_CDUT_P_SIZE_RT_OFFSET 6671 +#define PSWRQ2_REG_CDUC_P_SIZE_RT_OFFSET 6672 +#define PSWRQ2_REG_TM_P_SIZE_RT_OFFSET 6673 +#define PSWRQ2_REG_QM_P_SIZE_RT_OFFSET 6674 +#define PSWRQ2_REG_SRC_P_SIZE_RT_OFFSET 6675 +#define PSWRQ2_REG_TSDM_P_SIZE_RT_OFFSET 6676 +#define PSWRQ2_REG_TM_FIRST_ILT_RT_OFFSET 6677 +#define PSWRQ2_REG_TM_LAST_ILT_RT_OFFSET 6678 +#define PSWRQ2_REG_QM_FIRST_ILT_RT_OFFSET 6679 +#define PSWRQ2_REG_QM_LAST_ILT_RT_OFFSET 6680 +#define PSWRQ2_REG_SRC_FIRST_ILT_RT_OFFSET 6681 +#define PSWRQ2_REG_SRC_LAST_ILT_RT_OFFSET 6682 +#define PSWRQ2_REG_CDUC_FIRST_ILT_RT_OFFSET 6683 +#define PSWRQ2_REG_CDUC_LAST_ILT_RT_OFFSET 6684 +#define PSWRQ2_REG_CDUT_FIRST_ILT_RT_OFFSET 6685 +#define PSWRQ2_REG_CDUT_LAST_ILT_RT_OFFSET 6686 +#define PSWRQ2_REG_TSDM_FIRST_ILT_RT_OFFSET 6687 +#define PSWRQ2_REG_TSDM_LAST_ILT_RT_OFFSET 6688 +#define PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6689 +#define PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6690 +#define PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6691 +#define PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET 6692 +#define PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET 6693 +#define PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET 6694 +#define PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET 6695 +#define PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET 6696 +#define PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET 6697 +#define PSWRQ2_REG_VF_BASE_RT_OFFSET 6698 +#define PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET 6699 +#define PSWRQ2_REG_WR_MBS0_RT_OFFSET 6700 +#define PSWRQ2_REG_RD_MBS0_RT_OFFSET 6701 +#define PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET 6702 +#define PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET 6703 +#define PSWRQ2_REG_ILT_MEMORY_RT_OFFSET 6704 +#define PSWRQ2_REG_ILT_MEMORY_RT_SIZE 22000 +#define PGLUE_REG_B_VF_BASE_RT_OFFSET 28704 +#define PGLUE_REG_B_MSDM_OFFSET_MASK_B_RT_OFFSET 28705 +#define PGLUE_REG_B_MSDM_VF_SHIFT_B_RT_OFFSET 28706 +#define PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET 28707 +#define PGLUE_REG_B_PF_BAR0_SIZE_RT_OFFSET 28708 +#define PGLUE_REG_B_PF_BAR1_SIZE_RT_OFFSET 28709 +#define PGLUE_REG_B_VF_BAR1_SIZE_RT_OFFSET 28710 +#define TM_REG_VF_ENABLE_CONN_RT_OFFSET 28711 +#define TM_REG_PF_ENABLE_CONN_RT_OFFSET 28712 +#define TM_REG_PF_ENABLE_TASK_RT_OFFSET 28713 +#define TM_REG_GROUP_SIZE_RESOLUTION_CONN_RT_OFFSET 28714 +#define TM_REG_GROUP_SIZE_RESOLUTION_TASK_RT_OFFSET 28715 +#define TM_REG_CONFIG_CONN_MEM_RT_OFFSET 28716 +#define TM_REG_CONFIG_CONN_MEM_RT_SIZE 416 +#define TM_REG_CONFIG_TASK_MEM_RT_OFFSET 29132 +#define TM_REG_CONFIG_TASK_MEM_RT_SIZE 512 +#define QM_REG_MAXPQSIZE_0_RT_OFFSET 29644 +#define QM_REG_MAXPQSIZE_1_RT_OFFSET 29645 +#define QM_REG_MAXPQSIZE_2_RT_OFFSET 29646 +#define QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET 29647 +#define QM_REG_MAXPQSIZETXSEL_1_RT_OFFSET 29648 +#define QM_REG_MAXPQSIZETXSEL_2_RT_OFFSET 29649 +#define QM_REG_MAXPQSIZETXSEL_3_RT_OFFSET 29650 +#define QM_REG_MAXPQSIZETXSEL_4_RT_OFFSET 29651 +#define QM_REG_MAXPQSIZETXSEL_5_RT_OFFSET 29652 +#define QM_REG_MAXPQSIZETXSEL_6_RT_OFFSET 29653 +#define QM_REG_MAXPQSIZETXSEL_7_RT_OFFSET 29654 +#define QM_REG_MAXPQSIZETXSEL_8_RT_OFFSET 29655 +#define QM_REG_MAXPQSIZETXSEL_9_RT_OFFSET 29656 +#define QM_REG_MAXPQSIZETXSEL_10_RT_OFFSET 29657 +#define QM_REG_MAXPQSIZETXSEL_11_RT_OFFSET 29658 +#define QM_REG_MAXPQSIZETXSEL_12_RT_OFFSET 29659 +#define QM_REG_MAXPQSIZETXSEL_13_RT_OFFSET 29660 +#define QM_REG_MAXPQSIZETXSEL_14_RT_OFFSET 29661 +#define QM_REG_MAXPQSIZETXSEL_15_RT_OFFSET 29662 +#define QM_REG_MAXPQSIZETXSEL_16_RT_OFFSET 29663 +#define QM_REG_MAXPQSIZETXSEL_17_RT_OFFSET 29664 +#define QM_REG_MAXPQSIZETXSEL_18_RT_OFFSET 29665 +#define QM_REG_MAXPQSIZETXSEL_19_RT_OFFSET 29666 +#define QM_REG_MAXPQSIZETXSEL_20_RT_OFFSET 29667 +#define QM_REG_MAXPQSIZETXSEL_21_RT_OFFSET 29668 +#define QM_REG_MAXPQSIZETXSEL_22_RT_OFFSET 29669 +#define QM_REG_MAXPQSIZETXSEL_23_RT_OFFSET 29670 +#define QM_REG_MAXPQSIZETXSEL_24_RT_OFFSET 29671 +#define QM_REG_MAXPQSIZETXSEL_25_RT_OFFSET 29672 +#define QM_REG_MAXPQSIZETXSEL_26_RT_OFFSET 29673 +#define QM_REG_MAXPQSIZETXSEL_27_RT_OFFSET 29674 +#define QM_REG_MAXPQSIZETXSEL_28_RT_OFFSET 29675 +#define QM_REG_MAXPQSIZETXSEL_29_RT_OFFSET 29676 +#define QM_REG_MAXPQSIZETXSEL_30_RT_OFFSET 29677 +#define QM_REG_MAXPQSIZETXSEL_31_RT_OFFSET 29678 +#define QM_REG_MAXPQSIZETXSEL_32_RT_OFFSET 29679 +#define QM_REG_MAXPQSIZETXSEL_33_RT_OFFSET 29680 +#define QM_REG_MAXPQSIZETXSEL_34_RT_OFFSET 29681 +#define QM_REG_MAXPQSIZETXSEL_35_RT_OFFSET 29682 +#define QM_REG_MAXPQSIZETXSEL_36_RT_OFFSET 29683 +#define QM_REG_MAXPQSIZETXSEL_37_RT_OFFSET 29684 +#define QM_REG_MAXPQSIZETXSEL_38_RT_OFFSET 29685 +#define QM_REG_MAXPQSIZETXSEL_39_RT_OFFSET 29686 +#define QM_REG_MAXPQSIZETXSEL_40_RT_OFFSET 29687 +#define QM_REG_MAXPQSIZETXSEL_41_RT_OFFSET 29688 +#define QM_REG_MAXPQSIZETXSEL_42_RT_OFFSET 29689 +#define QM_REG_MAXPQSIZETXSEL_43_RT_OFFSET 29690 +#define QM_REG_MAXPQSIZETXSEL_44_RT_OFFSET 29691 +#define QM_REG_MAXPQSIZETXSEL_45_RT_OFFSET 29692 +#define QM_REG_MAXPQSIZETXSEL_46_RT_OFFSET 29693 +#define QM_REG_MAXPQSIZETXSEL_47_RT_OFFSET 29694 +#define QM_REG_MAXPQSIZETXSEL_48_RT_OFFSET 29695 +#define QM_REG_MAXPQSIZETXSEL_49_RT_OFFSET 29696 +#define QM_REG_MAXPQSIZETXSEL_50_RT_OFFSET 29697 +#define QM_REG_MAXPQSIZETXSEL_51_RT_OFFSET 29698 +#define QM_REG_MAXPQSIZETXSEL_52_RT_OFFSET 29699 +#define QM_REG_MAXPQSIZETXSEL_53_RT_OFFSET 29700 +#define QM_REG_MAXPQSIZETXSEL_54_RT_OFFSET 29701 +#define QM_REG_MAXPQSIZETXSEL_55_RT_OFFSET 29702 +#define QM_REG_MAXPQSIZETXSEL_56_RT_OFFSET 29703 +#define QM_REG_MAXPQSIZETXSEL_57_RT_OFFSET 29704 +#define QM_REG_MAXPQSIZETXSEL_58_RT_OFFSET 29705 +#define QM_REG_MAXPQSIZETXSEL_59_RT_OFFSET 29706 +#define QM_REG_MAXPQSIZETXSEL_60_RT_OFFSET 29707 +#define QM_REG_MAXPQSIZETXSEL_61_RT_OFFSET 29708 +#define QM_REG_MAXPQSIZETXSEL_62_RT_OFFSET 29709 +#define QM_REG_MAXPQSIZETXSEL_63_RT_OFFSET 29710 +#define QM_REG_BASEADDROTHERPQ_RT_OFFSET 29711 +#define QM_REG_BASEADDROTHERPQ_RT_SIZE 128 +#define QM_REG_VOQCRDLINE_RT_OFFSET 29839 +#define QM_REG_VOQCRDLINE_RT_SIZE 20 +#define QM_REG_VOQINITCRDLINE_RT_OFFSET 29859 +#define QM_REG_VOQINITCRDLINE_RT_SIZE 20 +#define QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET 29879 +#define QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET 29880 +#define QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET 29881 +#define QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET 29882 +#define QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET 29883 +#define QM_REG_WRROTHERPQGRP_0_RT_OFFSET 29884 +#define QM_REG_WRROTHERPQGRP_1_RT_OFFSET 29885 +#define QM_REG_WRROTHERPQGRP_2_RT_OFFSET 29886 +#define QM_REG_WRROTHERPQGRP_3_RT_OFFSET 29887 +#define QM_REG_WRROTHERPQGRP_4_RT_OFFSET 29888 +#define QM_REG_WRROTHERPQGRP_5_RT_OFFSET 29889 +#define QM_REG_WRROTHERPQGRP_6_RT_OFFSET 29890 +#define QM_REG_WRROTHERPQGRP_7_RT_OFFSET 29891 +#define QM_REG_WRROTHERPQGRP_8_RT_OFFSET 29892 +#define QM_REG_WRROTHERPQGRP_9_RT_OFFSET 29893 +#define QM_REG_WRROTHERPQGRP_10_RT_OFFSET 29894 +#define QM_REG_WRROTHERPQGRP_11_RT_OFFSET 29895 +#define QM_REG_WRROTHERPQGRP_12_RT_OFFSET 29896 +#define QM_REG_WRROTHERPQGRP_13_RT_OFFSET 29897 +#define QM_REG_WRROTHERPQGRP_14_RT_OFFSET 29898 +#define QM_REG_WRROTHERPQGRP_15_RT_OFFSET 29899 +#define QM_REG_WRROTHERGRPWEIGHT_0_RT_OFFSET 29900 +#define QM_REG_WRROTHERGRPWEIGHT_1_RT_OFFSET 29901 +#define QM_REG_WRROTHERGRPWEIGHT_2_RT_OFFSET 29902 +#define QM_REG_WRROTHERGRPWEIGHT_3_RT_OFFSET 29903 +#define QM_REG_WRRTXGRPWEIGHT_0_RT_OFFSET 29904 +#define QM_REG_WRRTXGRPWEIGHT_1_RT_OFFSET 29905 +#define QM_REG_PQTX2PF_0_RT_OFFSET 29906 +#define QM_REG_PQTX2PF_1_RT_OFFSET 29907 +#define QM_REG_PQTX2PF_2_RT_OFFSET 29908 +#define QM_REG_PQTX2PF_3_RT_OFFSET 29909 +#define QM_REG_PQTX2PF_4_RT_OFFSET 29910 +#define QM_REG_PQTX2PF_5_RT_OFFSET 29911 +#define QM_REG_PQTX2PF_6_RT_OFFSET 29912 +#define QM_REG_PQTX2PF_7_RT_OFFSET 29913 +#define QM_REG_PQTX2PF_8_RT_OFFSET 29914 +#define QM_REG_PQTX2PF_9_RT_OFFSET 29915 +#define QM_REG_PQTX2PF_10_RT_OFFSET 29916 +#define QM_REG_PQTX2PF_11_RT_OFFSET 29917 +#define QM_REG_PQTX2PF_12_RT_OFFSET 29918 +#define QM_REG_PQTX2PF_13_RT_OFFSET 29919 +#define QM_REG_PQTX2PF_14_RT_OFFSET 29920 +#define QM_REG_PQTX2PF_15_RT_OFFSET 29921 +#define QM_REG_PQTX2PF_16_RT_OFFSET 29922 +#define QM_REG_PQTX2PF_17_RT_OFFSET 29923 +#define QM_REG_PQTX2PF_18_RT_OFFSET 29924 +#define QM_REG_PQTX2PF_19_RT_OFFSET 29925 +#define QM_REG_PQTX2PF_20_RT_OFFSET 29926 +#define QM_REG_PQTX2PF_21_RT_OFFSET 29927 +#define QM_REG_PQTX2PF_22_RT_OFFSET 29928 +#define QM_REG_PQTX2PF_23_RT_OFFSET 29929 +#define QM_REG_PQTX2PF_24_RT_OFFSET 29930 +#define QM_REG_PQTX2PF_25_RT_OFFSET 29931 +#define QM_REG_PQTX2PF_26_RT_OFFSET 29932 +#define QM_REG_PQTX2PF_27_RT_OFFSET 29933 +#define QM_REG_PQTX2PF_28_RT_OFFSET 29934 +#define QM_REG_PQTX2PF_29_RT_OFFSET 29935 +#define QM_REG_PQTX2PF_30_RT_OFFSET 29936 +#define QM_REG_PQTX2PF_31_RT_OFFSET 29937 +#define QM_REG_PQTX2PF_32_RT_OFFSET 29938 +#define QM_REG_PQTX2PF_33_RT_OFFSET 29939 +#define QM_REG_PQTX2PF_34_RT_OFFSET 29940 +#define QM_REG_PQTX2PF_35_RT_OFFSET 29941 +#define QM_REG_PQTX2PF_36_RT_OFFSET 29942 +#define QM_REG_PQTX2PF_37_RT_OFFSET 29943 +#define QM_REG_PQTX2PF_38_RT_OFFSET 29944 +#define QM_REG_PQTX2PF_39_RT_OFFSET 29945 +#define QM_REG_PQTX2PF_40_RT_OFFSET 29946 +#define QM_REG_PQTX2PF_41_RT_OFFSET 29947 +#define QM_REG_PQTX2PF_42_RT_OFFSET 29948 +#define QM_REG_PQTX2PF_43_RT_OFFSET 29949 +#define QM_REG_PQTX2PF_44_RT_OFFSET 29950 +#define QM_REG_PQTX2PF_45_RT_OFFSET 29951 +#define QM_REG_PQTX2PF_46_RT_OFFSET 29952 +#define QM_REG_PQTX2PF_47_RT_OFFSET 29953 +#define QM_REG_PQTX2PF_48_RT_OFFSET 29954 +#define QM_REG_PQTX2PF_49_RT_OFFSET 29955 +#define QM_REG_PQTX2PF_50_RT_OFFSET 29956 +#define QM_REG_PQTX2PF_51_RT_OFFSET 29957 +#define QM_REG_PQTX2PF_52_RT_OFFSET 29958 +#define QM_REG_PQTX2PF_53_RT_OFFSET 29959 +#define QM_REG_PQTX2PF_54_RT_OFFSET 29960 +#define QM_REG_PQTX2PF_55_RT_OFFSET 29961 +#define QM_REG_PQTX2PF_56_RT_OFFSET 29962 +#define QM_REG_PQTX2PF_57_RT_OFFSET 29963 +#define QM_REG_PQTX2PF_58_RT_OFFSET 29964 +#define QM_REG_PQTX2PF_59_RT_OFFSET 29965 +#define QM_REG_PQTX2PF_60_RT_OFFSET 29966 +#define QM_REG_PQTX2PF_61_RT_OFFSET 29967 +#define QM_REG_PQTX2PF_62_RT_OFFSET 29968 +#define QM_REG_PQTX2PF_63_RT_OFFSET 29969 +#define QM_REG_PQOTHER2PF_0_RT_OFFSET 29970 +#define QM_REG_PQOTHER2PF_1_RT_OFFSET 29971 +#define QM_REG_PQOTHER2PF_2_RT_OFFSET 29972 +#define QM_REG_PQOTHER2PF_3_RT_OFFSET 29973 +#define QM_REG_PQOTHER2PF_4_RT_OFFSET 29974 +#define QM_REG_PQOTHER2PF_5_RT_OFFSET 29975 +#define QM_REG_PQOTHER2PF_6_RT_OFFSET 29976 +#define QM_REG_PQOTHER2PF_7_RT_OFFSET 29977 +#define QM_REG_PQOTHER2PF_8_RT_OFFSET 29978 +#define QM_REG_PQOTHER2PF_9_RT_OFFSET 29979 +#define QM_REG_PQOTHER2PF_10_RT_OFFSET 29980 +#define QM_REG_PQOTHER2PF_11_RT_OFFSET 29981 +#define QM_REG_PQOTHER2PF_12_RT_OFFSET 29982 +#define QM_REG_PQOTHER2PF_13_RT_OFFSET 29983 +#define QM_REG_PQOTHER2PF_14_RT_OFFSET 29984 +#define QM_REG_PQOTHER2PF_15_RT_OFFSET 29985 +#define QM_REG_RLGLBLPERIOD_0_RT_OFFSET 29986 +#define QM_REG_RLGLBLPERIOD_1_RT_OFFSET 29987 +#define QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET 29988 +#define QM_REG_RLGLBLPERIODTIMER_1_RT_OFFSET 29989 +#define QM_REG_RLGLBLPERIODSEL_0_RT_OFFSET 29990 +#define QM_REG_RLGLBLPERIODSEL_1_RT_OFFSET 29991 +#define QM_REG_RLGLBLPERIODSEL_2_RT_OFFSET 29992 +#define QM_REG_RLGLBLPERIODSEL_3_RT_OFFSET 29993 +#define QM_REG_RLGLBLPERIODSEL_4_RT_OFFSET 29994 +#define QM_REG_RLGLBLPERIODSEL_5_RT_OFFSET 29995 +#define QM_REG_RLGLBLPERIODSEL_6_RT_OFFSET 29996 +#define QM_REG_RLGLBLPERIODSEL_7_RT_OFFSET 29997 +#define QM_REG_RLGLBLINCVAL_RT_OFFSET 29998 +#define QM_REG_RLGLBLINCVAL_RT_SIZE 256 +#define QM_REG_RLGLBLUPPERBOUND_RT_OFFSET 30254 +#define QM_REG_RLGLBLUPPERBOUND_RT_SIZE 256 +#define QM_REG_RLGLBLCRD_RT_OFFSET 30510 +#define QM_REG_RLGLBLCRD_RT_SIZE 256 +#define QM_REG_RLGLBLENABLE_RT_OFFSET 30766 +#define QM_REG_RLPFPERIOD_RT_OFFSET 30767 +#define QM_REG_RLPFPERIODTIMER_RT_OFFSET 30768 +#define QM_REG_RLPFINCVAL_RT_OFFSET 30769 +#define QM_REG_RLPFINCVAL_RT_SIZE 16 +#define QM_REG_RLPFUPPERBOUND_RT_OFFSET 30785 +#define QM_REG_RLPFUPPERBOUND_RT_SIZE 16 +#define QM_REG_RLPFCRD_RT_OFFSET 30801 +#define QM_REG_RLPFCRD_RT_SIZE 16 +#define QM_REG_RLPFENABLE_RT_OFFSET 30817 +#define QM_REG_RLPFVOQENABLE_RT_OFFSET 30818 +#define QM_REG_WFQPFWEIGHT_RT_OFFSET 30819 +#define QM_REG_WFQPFWEIGHT_RT_SIZE 16 +#define QM_REG_WFQPFUPPERBOUND_RT_OFFSET 30835 +#define QM_REG_WFQPFUPPERBOUND_RT_SIZE 16 +#define QM_REG_WFQPFCRD_RT_OFFSET 30851 +#define QM_REG_WFQPFCRD_RT_SIZE 160 +#define QM_REG_WFQPFENABLE_RT_OFFSET 31011 +#define QM_REG_WFQVPENABLE_RT_OFFSET 31012 +#define QM_REG_BASEADDRTXPQ_RT_OFFSET 31013 +#define QM_REG_BASEADDRTXPQ_RT_SIZE 512 +#define QM_REG_TXPQMAP_RT_OFFSET 31525 +#define QM_REG_TXPQMAP_RT_SIZE 512 +#define QM_REG_WFQVPWEIGHT_RT_OFFSET 32037 +#define QM_REG_WFQVPWEIGHT_RT_SIZE 512 +#define QM_REG_WFQVPCRD_RT_OFFSET 32549 +#define QM_REG_WFQVPCRD_RT_SIZE 512 +#define QM_REG_WFQVPMAP_RT_OFFSET 33061 +#define QM_REG_WFQVPMAP_RT_SIZE 512 +#define QM_REG_WFQPFCRD_MSB_RT_OFFSET 33573 +#define QM_REG_WFQPFCRD_MSB_RT_SIZE 160 +#define NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET 33733 +#define NIG_REG_OUTER_TAG_VALUE_LIST0_RT_OFFSET 33734 +#define NIG_REG_OUTER_TAG_VALUE_LIST1_RT_OFFSET 33735 +#define NIG_REG_OUTER_TAG_VALUE_LIST2_RT_OFFSET 33736 +#define NIG_REG_OUTER_TAG_VALUE_LIST3_RT_OFFSET 33737 +#define NIG_REG_OUTER_TAG_VALUE_MASK_RT_OFFSET 33738 +#define NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET 33739 +#define NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET 33740 +#define NIG_REG_LLH_FUNC_TAG_EN_RT_SIZE 4 +#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_OFFSET 33744 +#define NIG_REG_LLH_FUNC_TAG_HDR_SEL_RT_SIZE 4 +#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET 33748 +#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_SIZE 4 +#define NIG_REG_LLH_FUNC_NO_TAG_RT_OFFSET 33752 +#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_OFFSET 33753 +#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_SIZE 32 +#define NIG_REG_LLH_FUNC_FILTER_EN_RT_OFFSET 33785 +#define NIG_REG_LLH_FUNC_FILTER_EN_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_OFFSET 33801 +#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET 33817 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET 33833 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_SIZE 16 +#define NIG_REG_TX_EDPM_CTRL_RT_OFFSET 33849 +#define NIG_REG_ROCE_DUPLICATE_TO_HOST_RT_OFFSET 33850 +#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET 33851 +#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET 33852 +#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET 33853 +#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET 33854 +#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET 33855 +#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET 33856 +#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET 33857 +#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET 33858 +#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET 33859 +#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET 33860 +#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET 33861 +#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET 33862 +#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET 33863 +#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET 33864 +#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET 33865 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET 33866 +#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET 33867 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET 33868 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET 33869 +#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET 33870 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET 33871 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET 33872 +#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET 33873 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET 33874 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET 33875 +#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET 33876 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET 33877 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET 33878 +#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET 33879 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET 33880 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET 33881 +#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET 33882 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET 33883 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET 33884 +#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET 33885 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET 33886 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET 33887 +#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET 33888 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET 33889 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET 33890 +#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET 33891 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET 33892 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET 33893 +#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET 33894 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET 33895 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET 33896 +#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET 33897 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET 33898 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET 33899 +#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET 33900 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET 33901 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET 33902 +#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET 33903 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET 33904 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET 33905 +#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET 33906 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET 33907 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET 33908 +#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET 33909 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET 33910 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET 33911 +#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET 33912 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET 33913 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET 33914 +#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET 33915 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET 33916 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET 33917 +#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET 33918 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET 33919 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET 33920 +#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET 33921 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET 33922 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET 33923 +#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET 33924 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET 33925 +#define XCM_REG_CON_PHY_Q3_RT_OFFSET 33926 + +#define RUNTIME_ARRAY_SIZE 33927 /* The eth storm context for the Tstorm */ struct tstorm_eth_conn_st_ctx { @@ -3201,7 +3566,31 @@ struct eth_conn_context { struct mstorm_eth_conn_st_ctx mstorm_st_context; }; -/* opcodes for the event ring */ +enum eth_error_code { + ETH_OK = 0x00, + ETH_FILTERS_MAC_ADD_FAIL_FULL, + ETH_FILTERS_MAC_ADD_FAIL_FULL_MTT2, + ETH_FILTERS_MAC_ADD_FAIL_DUP_MTT2, + ETH_FILTERS_MAC_ADD_FAIL_DUP_STT2, + ETH_FILTERS_MAC_DEL_FAIL_NOF, + ETH_FILTERS_MAC_DEL_FAIL_NOF_MTT2, + ETH_FILTERS_MAC_DEL_FAIL_NOF_STT2, + ETH_FILTERS_MAC_ADD_FAIL_ZERO_MAC, + ETH_FILTERS_VLAN_ADD_FAIL_FULL, + ETH_FILTERS_VLAN_ADD_FAIL_DUP, + ETH_FILTERS_VLAN_DEL_FAIL_NOF, + ETH_FILTERS_VLAN_DEL_FAIL_NOF_TT1, + ETH_FILTERS_PAIR_ADD_FAIL_DUP, + ETH_FILTERS_PAIR_ADD_FAIL_FULL, + ETH_FILTERS_PAIR_ADD_FAIL_FULL_MAC, + ETH_FILTERS_PAIR_DEL_FAIL_NOF, + ETH_FILTERS_PAIR_DEL_FAIL_NOF_TT1, + ETH_FILTERS_PAIR_ADD_FAIL_ZERO_MAC, + ETH_FILTERS_VNI_ADD_FAIL_FULL, + ETH_FILTERS_VNI_ADD_FAIL_DUP, + MAX_ETH_ERROR_CODE +}; + enum eth_event_opcode { ETH_EVENT_UNUSED, ETH_EVENT_VPORT_START, @@ -3269,7 +3658,13 @@ enum eth_filter_type { MAX_ETH_FILTER_TYPE }; -/* Ethernet Ramrod Command IDs */ +enum eth_ipv4_frag_type { + ETH_IPV4_NOT_FRAG, + ETH_IPV4_FIRST_FRAG, + ETH_IPV4_NON_FIRST_FRAG, + MAX_ETH_IPV4_FRAG_TYPE +}; + enum eth_ramrod_cmd_id { ETH_RAMROD_UNUSED, ETH_RAMROD_VPORT_START, @@ -3451,8 +3846,8 @@ struct rx_queue_start_ramrod_data { u8 toggle_val; u8 vf_rx_prod_index; - - u8 reserved[6]; + u8 vf_rx_prod_use_zone_a; + u8 reserved[5]; __le16 reserved1; struct regpair cqe_pbl_addr; struct regpair bd_base; @@ -3526,10 +3921,11 @@ struct tx_queue_start_ramrod_data { __le16 pxp_st_index; __le16 comp_agg_size; __le16 queue_zone_id; - __le16 test_dup_count; + __le16 reserved2; __le16 pbl_size; __le16 tx_queue_id; - + __le16 same_as_last_id; + __le16 reserved[3]; struct regpair pbl_base_addr; struct regpair bd_cons_address; }; @@ -4926,8 +5322,8 @@ struct roce_create_qp_resp_ramrod_data { #define ROCE_CREATE_QP_RESP_RAMROD_DATA_SRQ_FLG_SHIFT 5 #define ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN_MASK 0x1 #define ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN_SHIFT 6 -#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED0_MASK 0x1 -#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED0_SHIFT 7 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN_SHIFT 7 #define ROCE_CREATE_QP_RESP_RAMROD_DATA_PRI_MASK 0x7 #define ROCE_CREATE_QP_RESP_RAMROD_DATA_PRI_SHIFT 8 #define ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_MASK 0x1F @@ -4988,6 +5384,10 @@ enum roce_event_opcode { MAX_ROCE_EVENT_OPCODE }; +struct roce_init_func_ramrod_data { + struct rdma_init_func_ramrod_data rdma; +}; + struct roce_modify_qp_req_ramrod_data { __le16 flags; #define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_ERR_FLG_MASK 0x1 diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c index b7a4b27ebdbd..8ce8564061d5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c @@ -534,7 +534,7 @@ int qed_init_fw_data(struct qed_dev *cdev, const u8 *data) /* First Dword contains metadata and should be skipped */ buf_hdr = (struct bin_buffer_hdr *)(data + sizeof(u32)); - offset = buf_hdr[BIN_BUF_FW_VER_INFO].offset; + offset = buf_hdr[BIN_BUF_INIT_FW_VER_INFO].offset; fw->fw_ver_info = (struct fw_ver_info *)(data + offset); offset = buf_hdr[BIN_BUF_INIT_CMD].offset; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index b07bc810af92..4c212667b482 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -802,34 +802,6 @@ static u32 qed_mcp_get_shmem_func(struct qed_hwfn *p_hwfn, return size; } -int qed_hw_init_first_eth(struct qed_hwfn *p_hwfn, - struct qed_ptt *p_ptt, u8 *p_pf) -{ - struct public_func shmem_info; - int i; - - /* Find first Ethernet interface in port */ - for (i = 0; i < NUM_OF_ENG_PFS(p_hwfn->cdev); - i += p_hwfn->cdev->num_ports_in_engines) { - qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, - MCP_PF_ID_BY_REL(p_hwfn, i)); - - if (shmem_info.config & FUNC_MF_CFG_FUNC_HIDE) - continue; - - if ((shmem_info.config & FUNC_MF_CFG_PROTOCOL_MASK) == - FUNC_MF_CFG_PROTOCOL_ETHERNET) { - *p_pf = (u8)i; - return 0; - } - } - - DP_NOTICE(p_hwfn, - "Failed to find on port an ethernet interface in MF_SI mode\n"); - - return -EINVAL; -} - static void qed_mcp_update_bw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { struct qed_mcp_function_info *p_info; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 3324ef06f358..c6372fa574b7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -500,6 +500,4 @@ int __qed_configure_pf_min_bandwidth(struct qed_hwfn *p_hwfn, struct qed_mcp_link_state *p_link, u8 min_bw); -int qed_hw_init_first_eth(struct qed_hwfn *p_hwfn, - struct qed_ptt *p_ptt, u8 *p_pf); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index f6b86ca1ff79..b49d47f3de71 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -116,8 +116,14 @@ 0x1009c4UL #define QM_REG_PF_EN \ 0x2f2ea4UL +#define TCFC_REG_WEAK_ENABLE_VF \ + 0x2d0704UL #define TCFC_REG_STRONG_ENABLE_PF \ 0x2d0708UL +#define TCFC_REG_STRONG_ENABLE_VF \ + 0x2d070cUL +#define CCFC_REG_WEAK_ENABLE_VF \ + 0x2e0704UL #define CCFC_REG_STRONG_ENABLE_PF \ 0x2e0708UL #define PGLUE_B_REG_PGL_ADDR_88_F0 \ diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 51e4c906833f..1579f339a5ef 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -1280,6 +1280,13 @@ static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn, memset(resp, 0, sizeof(*resp)); + /* Write the PF version so that VF would know which version + * is supported - might be later overriden. This guarantees that + * VF could recognize legacy PF based on lack of versions in reply. + */ + pfdev_info->major_fp_hsi = ETH_HSI_VER_MAJOR; + pfdev_info->minor_fp_hsi = ETH_HSI_VER_MINOR; + /* Validate FW compatibility */ if (req->vfdev_info.eth_fp_hsi_major != ETH_HSI_VER_MAJOR) { DP_INFO(p_hwfn, @@ -1289,12 +1296,6 @@ static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn, req->vfdev_info.eth_fp_hsi_minor, ETH_HSI_VER_MAJOR, ETH_HSI_VER_MINOR); - /* Write the PF version so that VF would know which version - * is supported. - */ - pfdev_info->major_fp_hsi = ETH_HSI_VER_MAJOR; - pfdev_info->minor_fp_hsi = ETH_HSI_VER_MINOR; - goto out; } diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 81a63cf0a998..32325ca5951f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -25,7 +25,7 @@ #define QEDE_MAJOR_VERSION 8 #define QEDE_MINOR_VERSION 10 -#define QEDE_REVISION_VERSION 1 +#define QEDE_REVISION_VERSION 9 #define QEDE_ENGINEERING_VERSION 20 #define DRV_MODULE_VERSION __stringify(QEDE_MAJOR_VERSION) "." \ __stringify(QEDE_MINOR_VERSION) "." \ diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 40c0ada01806..d306e0b55581 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -5,28 +5,83 @@ * (GPL) Version 2, available from the file COPYING in the main directory of * this source tree. */ +#ifndef _COMMON_HSI_H +#define _COMMON_HSI_H +#include +#include +#include +#include + +/* dma_addr_t manip */ +#define DMA_LO(x) ((u32)(((dma_addr_t)(x)) & 0xffffffff)) +#define DMA_HI(x) ((u32)(((dma_addr_t)(x)) >> 32)) + +#define DMA_LO_LE(x) cpu_to_le32(DMA_LO(x)) +#define DMA_HI_LE(x) cpu_to_le32(DMA_HI(x)) + +/* It's assumed that whoever includes this has previously included an hsi + * file defining the regpair. + */ +#define DMA_REGPAIR_LE(x, val) (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)) + +#define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) +#define HILO_DMA(hi, lo) HILO_GEN(hi, lo, dma_addr_t) +#define HILO_64(hi, lo) HILO_GEN(hi, lo, u64) +#define HILO_DMA_REGPAIR(regpair) (HILO_DMA(regpair.hi, regpair.lo)) +#define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) #ifndef __COMMON_HSI__ #define __COMMON_HSI__ -#define CORE_SPQE_PAGE_SIZE_BYTES 4096 #define X_FINAL_CLEANUP_AGG_INT 1 + +#define EVENT_RING_PAGE_SIZE_BYTES 4096 + #define NUM_OF_GLOBAL_QUEUES 128 +#define COMMON_QUEUE_ENTRY_MAX_BYTE_SIZE 64 + +#define ISCSI_CDU_TASK_SEG_TYPE 0 +#define RDMA_CDU_TASK_SEG_TYPE 1 + +#define FW_ASSERT_GENERAL_ATTN_IDX 32 + +#define MAX_PINNED_CCFC 32 /* Queue Zone sizes in bytes */ #define TSTORM_QZONE_SIZE 8 -#define MSTORM_QZONE_SIZE 0 +#define MSTORM_QZONE_SIZE 16 #define USTORM_QZONE_SIZE 8 #define XSTORM_QZONE_SIZE 8 #define YSTORM_QZONE_SIZE 0 #define PSTORM_QZONE_SIZE 0 -#define ETH_MAX_NUM_RX_QUEUES_PER_VF 16 +#define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG 7 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT 16 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE 48 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD 112 + +/********************************/ +/* CORE (LIGHT L2) FW CONSTANTS */ +/********************************/ + +#define CORE_LL2_MAX_RAMROD_PER_CON 8 +#define CORE_LL2_TX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_CQE_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_NUM_NEXT_PAGE_BDS 1 + +#define CORE_LL2_TX_MAX_BDS_PER_PACKET 12 + +#define CORE_SPQE_PAGE_SIZE_BYTES 4096 + +#define MAX_NUM_LL2_RX_QUEUES 32 +#define MAX_NUM_LL2_TX_STATS_COUNTERS 32 #define FW_MAJOR_VERSION 8 #define FW_MINOR_VERSION 10 -#define FW_REVISION_VERSION 5 +#define FW_REVISION_VERSION 10 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -83,6 +138,17 @@ #define NUM_OF_LCIDS (320) #define NUM_OF_LTIDS (320) +/* Clock values */ +#define MASTER_CLK_FREQ_E4 (375e6) +#define STORM_CLK_FREQ_E4 (1000e6) +#define CLK25M_CLK_FREQ_E4 (25e6) + +/* Global PXP windows (GTT) */ +#define NUM_OF_GTT 19 +#define GTT_DWORD_SIZE_BITS 10 +#define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) +#define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) + /*****************/ /* CDU CONSTANTS */ /*****************/ @@ -90,6 +156,8 @@ #define CDU_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (17) #define CDU_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0x1ffff) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (12) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0xfff) /*****************/ /* DQ CONSTANTS */ /*****************/ @@ -115,6 +183,11 @@ #define DQ_XCM_ETH_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 #define DQ_XCM_ETH_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 #define DQ_XCM_ETH_GO_TO_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_ISCSI_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ISCSI_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 +#define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 /* UCM agg val selection (HW) */ #define DQ_UCM_AGG_VAL_SEL_WORD0 0 @@ -159,13 +232,16 @@ #define DQ_XCM_AGG_FLG_SHIFT_CF23 7 /* XCM agg counter flag selection */ -#define DQ_XCM_CORE_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) -#define DQ_XCM_CORE_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) -#define DQ_XCM_CORE_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) -#define DQ_XCM_ETH_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) -#define DQ_XCM_ETH_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) -#define DQ_XCM_ETH_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) -#define DQ_XCM_ETH_TPH_EN_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_CORE_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_CORE_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_CORE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_ETH_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ETH_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_TPH_EN_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_ISCSI_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ISCSI_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) /* UCM agg counter flag selection (HW) */ #define DQ_UCM_AGG_FLG_SHIFT_CF0 0 @@ -178,9 +254,45 @@ #define DQ_UCM_AGG_FLG_SHIFT_RULE1EN 7 /* UCM agg counter flag selection (FW) */ -#define DQ_UCM_ETH_PMD_TX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF4) -#define DQ_UCM_ETH_PMD_RX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF5) - +#define DQ_UCM_ETH_PMD_TX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ETH_PMD_RX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ROCE_CQ_ARM_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) + +/* TCM agg counter flag selection (HW) */ +#define DQ_TCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_TCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_TCM_AGG_FLG_SHIFT_CF2 2 +#define DQ_TCM_AGG_FLG_SHIFT_CF3 3 +#define DQ_TCM_AGG_FLG_SHIFT_CF4 4 +#define DQ_TCM_AGG_FLG_SHIFT_CF5 5 +#define DQ_TCM_AGG_FLG_SHIFT_CF6 6 +#define DQ_TCM_AGG_FLG_SHIFT_CF7 7 +/* TCM agg counter flag selection (FW) */ +#define DQ_TCM_ISCSI_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) + +/* PWM address mapping */ +#define DQ_PWM_OFFSET_DPM_BASE 0x0 +#define DQ_PWM_OFFSET_DPM_END 0x27 +#define DQ_PWM_OFFSET_XCM16_BASE 0x40 +#define DQ_PWM_OFFSET_XCM32_BASE 0x44 +#define DQ_PWM_OFFSET_UCM16_BASE 0x48 +#define DQ_PWM_OFFSET_UCM32_BASE 0x4C +#define DQ_PWM_OFFSET_UCM16_4 0x50 +#define DQ_PWM_OFFSET_TCM16_BASE 0x58 +#define DQ_PWM_OFFSET_TCM32_BASE 0x5C +#define DQ_PWM_OFFSET_XCM_FLAGS 0x68 +#define DQ_PWM_OFFSET_UCM_FLAGS 0x69 +#define DQ_PWM_OFFSET_TCM_FLAGS 0x6B + +#define DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD (DQ_PWM_OFFSET_XCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT (DQ_PWM_OFFSET_UCM32_BASE) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_16BIT (DQ_PWM_OFFSET_UCM16_4) +#define DQ_PWM_OFFSET_UCM_RDMA_INT_TIMEOUT (DQ_PWM_OFFSET_UCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_ARM_FLAGS (DQ_PWM_OFFSET_UCM_FLAGS) +#define DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 1) +#define DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 3) #define DQ_REGION_SHIFT (12) /* DPM */ @@ -214,15 +326,17 @@ */ #define CM_TX_PQ_BASE 0x200 +/* number of global Vport/QCN rate limiters */ +#define MAX_QM_GLOBAL_RLS 256 /* QM registers data */ #define QM_LINE_CRD_REG_WIDTH 16 -#define QM_LINE_CRD_REG_SIGN_BIT (1 << (QM_LINE_CRD_REG_WIDTH - 1)) +#define QM_LINE_CRD_REG_SIGN_BIT BIT((QM_LINE_CRD_REG_WIDTH - 1)) #define QM_BYTE_CRD_REG_WIDTH 24 -#define QM_BYTE_CRD_REG_SIGN_BIT (1 << (QM_BYTE_CRD_REG_WIDTH - 1)) +#define QM_BYTE_CRD_REG_SIGN_BIT BIT((QM_BYTE_CRD_REG_WIDTH - 1)) #define QM_WFQ_CRD_REG_WIDTH 32 -#define QM_WFQ_CRD_REG_SIGN_BIT (1 << (QM_WFQ_CRD_REG_WIDTH - 1)) +#define QM_WFQ_CRD_REG_SIGN_BIT BIT((QM_WFQ_CRD_REG_WIDTH - 1)) #define QM_RL_CRD_REG_WIDTH 32 -#define QM_RL_CRD_REG_SIGN_BIT (1 << (QM_RL_CRD_REG_WIDTH - 1)) +#define QM_RL_CRD_REG_SIGN_BIT BIT((QM_RL_CRD_REG_WIDTH - 1)) /*****************/ /* CAU CONSTANTS */ @@ -287,6 +401,17 @@ /* PXP CONSTANTS */ /*****************/ +/* Bars for Blocks */ +#define PXP_BAR_GRC 0 +#define PXP_BAR_TSDM 0 +#define PXP_BAR_USDM 0 +#define PXP_BAR_XSDM 0 +#define PXP_BAR_MSDM 0 +#define PXP_BAR_YSDM 0 +#define PXP_BAR_PSDM 0 +#define PXP_BAR_IGU 0 +#define PXP_BAR_DQ 1 + /* PTT and GTT */ #define PXP_NUM_PF_WINDOWS 12 #define PXP_PER_PF_ENTRY_SIZE 8 @@ -334,6 +459,52 @@ (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START + \ PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH - 1) +/* PF BAR */ +#define PXP_BAR0_START_GRC 0x0000 +#define PXP_BAR0_GRC_LENGTH 0x1C00000 +#define PXP_BAR0_END_GRC (PXP_BAR0_START_GRC + \ + PXP_BAR0_GRC_LENGTH - 1) + +#define PXP_BAR0_START_IGU 0x1C00000 +#define PXP_BAR0_IGU_LENGTH 0x10000 +#define PXP_BAR0_END_IGU (PXP_BAR0_START_IGU + \ + PXP_BAR0_IGU_LENGTH - 1) + +#define PXP_BAR0_START_TSDM 0x1C80000 +#define PXP_BAR0_SDM_LENGTH 0x40000 +#define PXP_BAR0_SDM_RESERVED_LENGTH 0x40000 +#define PXP_BAR0_END_TSDM (PXP_BAR0_START_TSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_MSDM 0x1D00000 +#define PXP_BAR0_END_MSDM (PXP_BAR0_START_MSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_USDM 0x1D80000 +#define PXP_BAR0_END_USDM (PXP_BAR0_START_USDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_XSDM 0x1E00000 +#define PXP_BAR0_END_XSDM (PXP_BAR0_START_XSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_YSDM 0x1E80000 +#define PXP_BAR0_END_YSDM (PXP_BAR0_START_YSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_PSDM 0x1F00000 +#define PXP_BAR0_END_PSDM (PXP_BAR0_START_PSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_FIRST_INVALID_ADDRESS (PXP_BAR0_END_PSDM + 1) + +/* VF BAR */ +#define PXP_VF_BAR0 0 + +#define PXP_VF_BAR0_START_GRC 0x3E00 +#define PXP_VF_BAR0_GRC_LENGTH 0x200 +#define PXP_VF_BAR0_END_GRC (PXP_VF_BAR0_START_GRC + \ + PXP_VF_BAR0_GRC_LENGTH - 1) #define PXP_VF_BAR0_START_IGU 0 #define PXP_VF_BAR0_IGU_LENGTH 0x3000 @@ -399,6 +570,20 @@ #define PXP_NUM_ILT_RECORDS_BB 7600 #define PXP_NUM_ILT_RECORDS_K2 11000 #define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB, PXP_NUM_ILT_RECORDS_K2) +#define PXP_QUEUES_ZONE_MAX_NUM 320 +/*****************/ +/* PRM CONSTANTS */ +/*****************/ +#define PRM_DMA_PAD_BYTES_NUM 2 +/******************/ +/* SDMs CONSTANTS */ +/******************/ +#define SDM_OP_GEN_TRIG_NONE 0 +#define SDM_OP_GEN_TRIG_WAKE_THREAD 1 +#define SDM_OP_GEN_TRIG_AGG_INT 2 +#define SDM_OP_GEN_TRIG_LOADER 4 +#define SDM_OP_GEN_TRIG_INDICATE_ERROR 6 +#define SDM_OP_GEN_TRIG_RELEASE_THREAD 7 #define SDM_COMP_TYPE_NONE 0 #define SDM_COMP_TYPE_WAKE_THREAD 1 @@ -424,6 +609,8 @@ /* PRS CONSTANTS */ /*****************/ +#define PRS_GFT_CAM_LINES_NO_MATCH 31 + /* Async data KCQ CQE */ struct async_data { __le32 cid; @@ -440,20 +627,6 @@ struct coalescing_timeset { #define COALESCING_TIMESET_VALID_SHIFT 7 }; -struct common_prs_pf_msg_info { - __le32 value; -#define COMMON_PRS_PF_MSG_INFO_NPAR_DEFAULT_PF_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_NPAR_DEFAULT_PF_SHIFT 0 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_1_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_1_SHIFT 1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_2_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_2_SHIFT 2 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_3_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_3_SHIFT 3 -#define COMMON_PRS_PF_MSG_INFO_RESERVED_MASK 0xFFFFFFF -#define COMMON_PRS_PF_MSG_INFO_RESERVED_SHIFT 4 -}; - struct common_queue_zone { __le16 ring_drv_data_consumer; __le16 reserved; @@ -473,6 +646,19 @@ struct vf_pf_channel_eqe_data { struct regpair msg_addr; }; +struct iscsi_eqe_data { + __le32 cid; + __le16 conn_id; + u8 error_code; + u8 error_pdu_opcode_reserved; +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_MASK 0x3F +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_SHIFT 0 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_MASK 0x1 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_SHIFT 6 +#define ISCSI_EQE_DATA_RESERVED0_MASK 0x1 +#define ISCSI_EQE_DATA_RESERVED0_SHIFT 7 +}; + struct malicious_vf_eqe_data { u8 vf_id; u8 err_id; @@ -488,6 +674,7 @@ struct initial_cleanup_eqe_data { union event_ring_data { u8 bytes[8]; struct vf_pf_channel_eqe_data vf_pf_channel; + struct iscsi_eqe_data iscsi_info; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; }; @@ -616,6 +803,52 @@ enum db_dest { MAX_DB_DEST }; +/* Enum of doorbell DPM types */ +enum db_dpm_type { + DPM_LEGACY, + DPM_ROCE, + DPM_L2_INLINE, + DPM_L2_BD, + MAX_DB_DPM_TYPE +}; + +/* Structure for doorbell data, in L2 DPM mode, for 1st db in a DPM burst */ +struct db_l2_dpm_data { + __le16 icid; + __le16 bd_prod; + __le32 params; +#define DB_L2_DPM_DATA_SIZE_MASK 0x3F +#define DB_L2_DPM_DATA_SIZE_SHIFT 0 +#define DB_L2_DPM_DATA_DPM_TYPE_MASK 0x3 +#define DB_L2_DPM_DATA_DPM_TYPE_SHIFT 6 +#define DB_L2_DPM_DATA_NUM_BDS_MASK 0xFF +#define DB_L2_DPM_DATA_NUM_BDS_SHIFT 8 +#define DB_L2_DPM_DATA_PKT_SIZE_MASK 0x7FF +#define DB_L2_DPM_DATA_PKT_SIZE_SHIFT 16 +#define DB_L2_DPM_DATA_RESERVED0_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED0_SHIFT 27 +#define DB_L2_DPM_DATA_SGE_NUM_MASK 0x7 +#define DB_L2_DPM_DATA_SGE_NUM_SHIFT 28 +#define DB_L2_DPM_DATA_RESERVED1_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED1_SHIFT 31 +}; + +/* Structure for SGE in a DPM doorbell of type DPM_L2_BD */ +struct db_l2_dpm_sge { + struct regpair addr; + __le16 nbytes; + __le16 bitfields; +#define DB_L2_DPM_SGE_TPH_ST_INDEX_MASK 0x1FF +#define DB_L2_DPM_SGE_TPH_ST_INDEX_SHIFT 0 +#define DB_L2_DPM_SGE_RESERVED0_MASK 0x3 +#define DB_L2_DPM_SGE_RESERVED0_SHIFT 9 +#define DB_L2_DPM_SGE_ST_VALID_MASK 0x1 +#define DB_L2_DPM_SGE_ST_VALID_SHIFT 11 +#define DB_L2_DPM_SGE_RESERVED1_MASK 0xF +#define DB_L2_DPM_SGE_RESERVED1_SHIFT 12 + __le32 reserved2; +}; + /* Structure for doorbell address, in legacy mode */ struct db_legacy_addr { __le32 addr; @@ -627,6 +860,49 @@ struct db_legacy_addr { #define DB_LEGACY_ADDR_ICID_SHIFT 5 }; +/* Structure for doorbell address, in PWM mode */ +struct db_pwm_addr { + __le32 addr; +#define DB_PWM_ADDR_RESERVED0_MASK 0x7 +#define DB_PWM_ADDR_RESERVED0_SHIFT 0 +#define DB_PWM_ADDR_OFFSET_MASK 0x7F +#define DB_PWM_ADDR_OFFSET_SHIFT 3 +#define DB_PWM_ADDR_WID_MASK 0x3 +#define DB_PWM_ADDR_WID_SHIFT 10 +#define DB_PWM_ADDR_DPI_MASK 0xFFFF +#define DB_PWM_ADDR_DPI_SHIFT 12 +#define DB_PWM_ADDR_RESERVED1_MASK 0xF +#define DB_PWM_ADDR_RESERVED1_SHIFT 28 +}; + +/* Parameters to RoCE firmware, passed in EDPM doorbell */ +struct db_roce_dpm_params { + __le32 params; +#define DB_ROCE_DPM_PARAMS_SIZE_MASK 0x3F +#define DB_ROCE_DPM_PARAMS_SIZE_SHIFT 0 +#define DB_ROCE_DPM_PARAMS_DPM_TYPE_MASK 0x3 +#define DB_ROCE_DPM_PARAMS_DPM_TYPE_SHIFT 6 +#define DB_ROCE_DPM_PARAMS_OPCODE_MASK 0xFF +#define DB_ROCE_DPM_PARAMS_OPCODE_SHIFT 8 +#define DB_ROCE_DPM_PARAMS_WQE_SIZE_MASK 0x7FF +#define DB_ROCE_DPM_PARAMS_WQE_SIZE_SHIFT 16 +#define DB_ROCE_DPM_PARAMS_RESERVED0_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_RESERVED0_SHIFT 27 +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_SHIFT 28 +#define DB_ROCE_DPM_PARAMS_S_FLG_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_S_FLG_SHIFT 29 +#define DB_ROCE_DPM_PARAMS_RESERVED1_MASK 0x3 +#define DB_ROCE_DPM_PARAMS_RESERVED1_SHIFT 30 +}; + +/* Structure for doorbell data, in ROCE DPM mode, for 1st db in a DPM burst */ +struct db_roce_dpm_data { + __le16 icid; + __le16 prod_val; + struct db_roce_dpm_params params; +}; + /* Igu interrupt command */ enum igu_int_cmd { IGU_INT_ENABLE = 0, @@ -764,6 +1040,19 @@ struct pxp_ptt_entry { struct pxp_pretend_cmd pretend; }; +/* VF Zone A Permission Register. */ +struct pxp_vf_zone_a_permission { + __le32 control; +#define PXP_VF_ZONE_A_PERMISSION_VFID_MASK 0xFF +#define PXP_VF_ZONE_A_PERMISSION_VFID_SHIFT 0 +#define PXP_VF_ZONE_A_PERMISSION_VALID_MASK 0x1 +#define PXP_VF_ZONE_A_PERMISSION_VALID_SHIFT 8 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_MASK 0x7F +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_SHIFT 9 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_MASK 0xFFFF +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_SHIFT 16 +}; + /* RSS hash type */ struct rdif_task_context { __le32 initial_ref_tag; @@ -831,6 +1120,7 @@ struct rdif_task_context { __le32 reserved2; }; +/* RSS hash type */ enum rss_hash_type { RSS_HASH_TYPE_DEFAULT = 0, RSS_HASH_TYPE_IPV4 = 1, @@ -942,7 +1232,7 @@ struct tdif_task_context { }; struct timers_context { - __le32 logical_client0; + __le32 logical_client_0; #define TIMERS_CONTEXT_EXPIRATIONTIMELC0_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC0_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC0_MASK 0x1 @@ -951,7 +1241,7 @@ struct timers_context { #define TIMERS_CONTEXT_ACTIVELC0_SHIFT 29 #define TIMERS_CONTEXT_RESERVED0_MASK 0x3 #define TIMERS_CONTEXT_RESERVED0_SHIFT 30 - __le32 logical_client1; + __le32 logical_client_1; #define TIMERS_CONTEXT_EXPIRATIONTIMELC1_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC1_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC1_MASK 0x1 @@ -960,7 +1250,7 @@ struct timers_context { #define TIMERS_CONTEXT_ACTIVELC1_SHIFT 29 #define TIMERS_CONTEXT_RESERVED1_MASK 0x3 #define TIMERS_CONTEXT_RESERVED1_SHIFT 30 - __le32 logical_client2; + __le32 logical_client_2; #define TIMERS_CONTEXT_EXPIRATIONTIMELC2_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC2_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC2_MASK 0x1 @@ -978,3 +1268,4 @@ struct timers_context { #define TIMERS_CONTEXT_RESERVED3_SHIFT 29 }; #endif /* __COMMON_HSI__ */ +#endif diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index b5ebc697d05f..1aa0727c4136 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -13,9 +13,12 @@ /* ETH FW CONSTANTS */ /********************/ #define ETH_HSI_VER_MAJOR 3 -#define ETH_HSI_VER_MINOR 0 -#define ETH_CACHE_LINE_SIZE 64 +#define ETH_HSI_VER_MINOR 10 + +#define ETH_HSI_VER_NO_PKT_LEN_TUNN 5 +#define ETH_CACHE_LINE_SIZE 64 +#define ETH_RX_CQE_GAP 32 #define ETH_MAX_RAMROD_PER_CON 8 #define ETH_TX_BD_PAGE_SIZE_BYTES 4096 #define ETH_RX_BD_PAGE_SIZE_BYTES 4096 @@ -24,15 +27,25 @@ #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 +#define ETH_TX_MAX_BDS_PER_LSO_PACKET 255 #define ETH_TX_MAX_LSO_HDR_NBD 4 #define ETH_TX_MIN_BDS_PER_LSO_PKT 3 #define ETH_TX_MIN_BDS_PER_TUNN_IPV6_WITH_EXT_PKT 3 #define ETH_TX_MIN_BDS_PER_IPV6_WITH_EXT_PKT 2 #define ETH_TX_MIN_BDS_PER_PKT_W_LOOPBACK_MODE 2 -#define ETH_TX_MAX_NON_LSO_PKT_LEN (9700 - (4 + 12 + 8)) +#define ETH_TX_MAX_NON_LSO_PKT_LEN (9700 - (4 + 4 + 12 + 8)) #define ETH_TX_MAX_LSO_HDR_BYTES 510 +#define ETH_TX_LSO_WINDOW_BDS_NUM (18 - 1) +#define ETH_TX_LSO_WINDOW_MIN_LEN 9700 +#define ETH_TX_MAX_LSO_PAYLOAD_LEN 0xFE000 +#define ETH_TX_NUM_SAME_AS_LAST_ENTRIES 320 +#define ETH_TX_INACTIVE_SAME_AS_LAST 0xFFFF #define ETH_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define ETH_NUM_STATISTIC_COUNTERS_DOUBLE_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - MAX_NUM_VFS / 2) +#define ETH_NUM_STATISTIC_COUNTERS_QUAD_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - 3 * MAX_NUM_VFS / 4) /* Maximum number of buffers, used for RX packet placement */ #define ETH_RX_MAX_BUFF_PER_PKT 5 @@ -59,6 +72,8 @@ #define ETH_TPA_CQE_CONT_LEN_LIST_SIZE 6 #define ETH_TPA_CQE_END_LEN_LIST_SIZE 4 +/* Control frame check constants */ +#define ETH_CTL_FRAME_ETH_TYPE_NUM 4 struct eth_tx_1st_bd_flags { u8 bitfields; @@ -82,10 +97,10 @@ struct eth_tx_1st_bd_flags { /* The parsing information data fo rthe first tx bd of a given packet. */ struct eth_tx_data_1st_bd { - __le16 vlan; - u8 nbds; - struct eth_tx_1st_bd_flags bd_flags; - __le16 bitfields; + __le16 vlan; + u8 nbds; + struct eth_tx_1st_bd_flags bd_flags; + __le16 bitfields; #define ETH_TX_DATA_1ST_BD_TUNN_FLAG_MASK 0x1 #define ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT 0 #define ETH_TX_DATA_1ST_BD_RESERVED0_MASK 0x1 @@ -96,7 +111,7 @@ struct eth_tx_data_1st_bd { /* The parsing information data for the second tx bd of a given packet. */ struct eth_tx_data_2nd_bd { - __le16 tunn_ip_size; + __le16 tunn_ip_size; __le16 bitfields1; #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_MASK 0xF #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_SHIFT 0 @@ -125,9 +140,14 @@ struct eth_tx_data_2nd_bd { #define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT 13 }; +/* Firmware data for L2-EDPM packet. */ +struct eth_edpm_fw_data { + struct eth_tx_data_1st_bd data_1st_bd; + struct eth_tx_data_2nd_bd data_2nd_bd; + __le32 reserved; +}; + struct eth_fast_path_cqe_fw_debug { - u8 reserved0; - u8 reserved1; __le16 reserved2; }; @@ -148,6 +168,17 @@ struct eth_tunnel_parsing_flags { #define ETH_TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_SHIFT 7 }; +/* PMD flow control bits */ +struct eth_pmd_flow_flags { + u8 flags; +#define ETH_PMD_FLOW_FLAGS_VALID_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_VALID_SHIFT 0 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_SHIFT 1 +#define ETH_PMD_FLOW_FLAGS_RESERVED_MASK 0x3F +#define ETH_PMD_FLOW_FLAGS_RESERVED_SHIFT 2 +}; + /* Regular ETH Rx FP CQE. */ struct eth_fast_path_rx_reg_cqe { u8 type; @@ -166,64 +197,63 @@ struct eth_fast_path_rx_reg_cqe { u8 placement_offset; struct eth_tunnel_parsing_flags tunnel_pars_flags; u8 bd_num; - u8 reserved[7]; + u8 reserved[9]; struct eth_fast_path_cqe_fw_debug fw_debug; u8 reserved1[3]; - u8 flags; -#define ETH_FAST_PATH_RX_REG_CQE_VALID_MASK 0x1 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_SHIFT 0 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_MASK 0x1 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_SHIFT 1 -#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_MASK 0x3F -#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_SHIFT 2 + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-continue ETH Rx FP CQE. */ struct eth_fast_path_rx_tpa_cont_cqe { - u8 type; - u8 tpa_agg_index; - __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; - u8 reserved[5]; - u8 reserved1; - __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 type; + u8 tpa_agg_index; + __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved; + u8 reserved1; + __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved3[3]; + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-end ETH Rx FP CQE. */ struct eth_fast_path_rx_tpa_end_cqe { - u8 type; - u8 tpa_agg_index; - __le16 total_packet_len; - u8 num_of_bds; - u8 end_reason; - __le16 num_of_coalesced_segs; - __le32 ts_delta; - __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; - u8 reserved1[3]; - u8 reserved2; - __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + u8 type; + u8 tpa_agg_index; + __le16 total_packet_len; + u8 num_of_bds; + u8 end_reason; + __le16 num_of_coalesced_segs; + __le32 ts_delta; + __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved1; + u8 reserved2; + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-start ETH Rx FP CQE. */ struct eth_fast_path_rx_tpa_start_cqe { - u8 type; - u8 bitfields; + u8 type; + u8 bitfields; #define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_MASK 0x7 #define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_SHIFT 0 #define ETH_FAST_PATH_RX_TPA_START_CQE_TC_MASK 0xF #define ETH_FAST_PATH_RX_TPA_START_CQE_TC_SHIFT 3 #define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_MASK 0x1 #define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_SHIFT 7 - __le16 seg_len; + __le16 seg_len; struct parsing_and_err_flags pars_flags; - __le16 vlan_tag; - __le32 rss_hash; - __le16 len_on_first_bd; - u8 placement_offset; + __le16 vlan_tag; + __le32 rss_hash; + __le16 len_on_first_bd; + u8 placement_offset; struct eth_tunnel_parsing_flags tunnel_pars_flags; - u8 tpa_agg_index; - u8 header_len; - __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; + u8 tpa_agg_index; + u8 header_len; + __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; struct eth_fast_path_cqe_fw_debug fw_debug; + u8 reserved; + struct eth_pmd_flow_flags pmd_flags; }; /* The L4 pseudo checksum mode for Ethernet */ @@ -245,15 +275,7 @@ struct eth_slow_path_rx_cqe { u8 reserved[25]; __le16 echo; u8 reserved1; - u8 flags; -/* for PMD mode - valid indication */ -#define ETH_SLOW_PATH_RX_CQE_VALID_MASK 0x1 -#define ETH_SLOW_PATH_RX_CQE_VALID_SHIFT 0 -/* for PMD mode - valid toggle indication */ -#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_MASK 0x1 -#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_SHIFT 1 -#define ETH_SLOW_PATH_RX_CQE_RESERVED2_MASK 0x3F -#define ETH_SLOW_PATH_RX_CQE_RESERVED2_SHIFT 2 + struct eth_pmd_flow_flags pmd_flags; }; /* union for all ETH Rx CQE types */ @@ -276,6 +298,11 @@ enum eth_rx_cqe_type { MAX_ETH_RX_CQE_TYPE }; +struct eth_rx_pmd_cqe { + union eth_rx_cqe cqe; + u8 reserved[ETH_RX_CQE_GAP]; +}; + enum eth_rx_tunn_type { ETH_RX_NO_TUNN, ETH_RX_TUNN_GENEVE, @@ -313,8 +340,8 @@ struct eth_tx_2nd_bd { /* The parsing information data for the third tx bd of a given packet. */ struct eth_tx_data_3rd_bd { - __le16 lso_mss; - __le16 bitfields; + __le16 lso_mss; + __le16 bitfields; #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK 0xF #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT 0 #define ETH_TX_DATA_3RD_BD_HDR_NBD_MASK 0xF @@ -323,8 +350,8 @@ struct eth_tx_data_3rd_bd { #define ETH_TX_DATA_3RD_BD_START_BD_SHIFT 8 #define ETH_TX_DATA_3RD_BD_RESERVED0_MASK 0x7F #define ETH_TX_DATA_3RD_BD_RESERVED0_SHIFT 9 - u8 tunn_l4_hdr_start_offset_w; - u8 tunn_hdr_size_w; + u8 tunn_l4_hdr_start_offset_w; + u8 tunn_hdr_size_w; }; /* The third tx bd of a given packet */ @@ -355,10 +382,10 @@ struct eth_tx_bd { }; union eth_tx_bd_types { - struct eth_tx_1st_bd first_bd; - struct eth_tx_2nd_bd second_bd; - struct eth_tx_3rd_bd third_bd; - struct eth_tx_bd reg_bd; + struct eth_tx_1st_bd first_bd; + struct eth_tx_2nd_bd second_bd; + struct eth_tx_3rd_bd third_bd; + struct eth_tx_bd reg_bd; }; /* Mstorm Queue Zone */ @@ -389,8 +416,8 @@ struct eth_db_data { #define ETH_DB_DATA_RESERVED_SHIFT 5 #define ETH_DB_DATA_AGG_VAL_SEL_MASK 0x3 #define ETH_DB_DATA_AGG_VAL_SEL_SHIFT 6 - u8 agg_flags; - __le16 bd_prod; + u8 agg_flags; + __le16 bd_prod; }; #endif /* __ETH_COMMON__ */ diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index b3c0feb15ae9..8f64b1223c2f 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -311,7 +311,7 @@ struct iscsi_login_req_hdr { #define ISCSI_LOGIN_REQ_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 isid_TABC; + __le32 isid_tabc; __le16 tsih; __le16 isid_d; __le32 itt; @@ -464,7 +464,7 @@ struct iscsi_login_response_hdr { #define ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 isid_TABC; + __le32 isid_tabc; __le16 tsih; __le16 isid_d; __le32 itt; @@ -688,8 +688,7 @@ union iscsi_cqe { enum iscsi_cqes_type { ISCSI_CQE_TYPE_SOLICITED = 1, ISCSI_CQE_TYPE_UNSOLICITED, - ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE - , + ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE, ISCSI_CQE_TYPE_TASK_CLEANUP, ISCSI_CQE_TYPE_DUMMY, MAX_ISCSI_CQES_TYPE @@ -769,9 +768,9 @@ enum iscsi_eqe_opcode { ISCSI_EVENT_TYPE_UPDATE_CONN, ISCSI_EVENT_TYPE_CLEAR_SQ, ISCSI_EVENT_TYPE_TERMINATE_CONN, + ISCSI_EVENT_TYPE_MAC_UPDATE_CONN, ISCSI_EVENT_TYPE_ASYN_CONNECT_COMPLETE, ISCSI_EVENT_TYPE_ASYN_TERMINATE_DONE, - RESERVED8, RESERVED9, ISCSI_EVENT_TYPE_START_OF_ERROR_TYPES = 10, ISCSI_EVENT_TYPE_ASYN_ABORT_RCVD, @@ -867,6 +866,7 @@ enum iscsi_ramrod_cmd_id { ISCSI_RAMROD_CMD_ID_UPDATE_CONN = 4, ISCSI_RAMROD_CMD_ID_TERMINATION_CONN = 5, ISCSI_RAMROD_CMD_ID_CLEAR_SQ = 6, + ISCSI_RAMROD_CMD_ID_MAC_UPDATE = 7, MAX_ISCSI_RAMROD_CMD_ID }; @@ -883,6 +883,16 @@ union iscsi_seq_num { __le16 r2t_sn; }; +struct iscsi_spe_conn_mac_update { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + u8 reserved0[2]; +}; + struct iscsi_spe_conn_offload { struct iscsi_slow_path_hdr hdr; __le16 conn_id; @@ -1302,14 +1312,6 @@ struct mstorm_iscsi_stats_drv { struct regpair iscsi_rx_dropped_pdus_task_not_valid; }; -struct ooo_opaque { - __le32 cid; - u8 drop_isle; - u8 drop_size; - u8 ooo_opcode; - u8 ooo_isle; -}; - struct pstorm_iscsi_stats_drv { struct regpair iscsi_tx_bytes_cnt; struct regpair iscsi_tx_packet_cnt; diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 7e441bdeabdc..72d88cf3ca25 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -16,19 +16,6 @@ #include #include -/* dma_addr_t manip */ -#define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) -#define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) -#define DMA_REGPAIR_LE(x, val) do { \ - (x).hi = DMA_HI_LE((val)); \ - (x).lo = DMA_LO_LE((val)); \ - } while (0) - -#define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) -#define HILO_64(hi, lo) HILO_GEN((le32_to_cpu(hi)), (le32_to_cpu(lo)), u64) -#define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) -#define HILO_DMA_REGPAIR(regpair) ((dma_addr_t)HILO_64_REGPAIR(regpair)) - enum qed_chain_mode { /* Each Page contains a next pointer at its end */ QED_CHAIN_MODE_NEXT_PTR, diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h index accba0e6b704..dc3889d1bbe6 100644 --- a/include/linux/qed/tcp_common.h +++ b/include/linux/qed/tcp_common.h @@ -11,6 +11,14 @@ #define TCP_INVALID_TIMEOUT_VAL -1 +struct ooo_opaque { + __le32 cid; + u8 drop_isle; + u8 drop_size; + u8 ooo_opcode; + u8 ooo_isle; +}; + enum tcp_connect_mode { TCP_CONNECT_ACTIVE, TCP_CONNECT_PASSIVE, @@ -18,14 +26,10 @@ enum tcp_connect_mode { }; struct tcp_init_params { - __le32 max_cwnd; - __le16 dup_ack_threshold; + __le32 two_msl_timer; __le16 tx_sws_timer; - __le16 min_rto; - __le16 min_rto_rt; - __le16 max_rto; u8 maxfinrt; - u8 reserved[1]; + u8 reserved[9]; }; enum tcp_ip_version { -- cgit v1.2.3 From 3d2f30a1df907e3ef4175121f0d21456630a72aa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Aug 2016 01:46:06 +0200 Subject: netfilter: nf_tables: add quota expression This patch adds the quota expression. This new stateful expression integrate easily into the dynset expression to build 'hashquota' flow tables. Arguably, we could use instead "counter bytes > 1000" instead, but this approach has several problems: 1) We only support for one single stateful expression in dynamic set definitions, and the expression above is a composite of two expressions: get counter + comparison. 2) We would need to restore the packed counter representation (that we used to have) based on seqlock to synchronize this, since per-cpu is not suitable for this. So instead of bloating the counter expression back with the seqlock representation and extending the existing set infrastructure to make it more complex for the composite described above, let's follow the more simple approach of adding a quota expression that we can plug into our existing infrastructure. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 19 +++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_quota.c | 121 +++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 net/netfilter/nft_quota.c (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6ce0a6dd0889..784fbf15ab3d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -900,6 +900,25 @@ enum nft_queue_attributes { #define NFT_QUEUE_FLAG_CPU_FANOUT 0x02 /* use current CPU (no hashing) */ #define NFT_QUEUE_FLAG_MASK 0x03 +enum nft_quota_flags { + NFT_QUOTA_F_INV = (1 << 0), +}; + +/** + * enum nft_quota_attributes - nf_tables quota expression netlink attributes + * + * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U16) + * @NFTA_QUOTA_FLAGS: flags (NLA_U32) + */ +enum nft_quota_attributes { + NFTA_QUOTA_UNSPEC, + NFTA_QUOTA_BYTES, + NFTA_QUOTA_FLAGS, + NFTA_QUOTA_PAD, + __NFTA_QUOTA_MAX +}; +#define NFTA_QUOTA_MAX (__NFTA_QUOTA_MAX - 1) + /** * enum nft_reject_types - nf_tables reject expression reject types * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9cfaa00c79b2..29a8078deafa 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -542,6 +542,12 @@ config NFT_QUEUE This is required if you intend to use the userspace queueing infrastructure (also known as NFQUEUE) from nftables. +config NFT_QUOTA + tristate "Netfilter nf_tables quota module" + help + This option adds the "quota" expression that you can use to match + enforce bytes quotas. + config NFT_REJECT default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1106ccde215c..0fc42df19b8c 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o +obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c new file mode 100644 index 000000000000..6eafbf987ed9 --- /dev/null +++ b/net/netfilter/nft_quota.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_quota { + u64 quota; + bool invert; + atomic64_t remain; +}; + +static inline long nft_quota(struct nft_quota *priv, + const struct nft_pktinfo *pkt) +{ + return atomic64_sub_return(pkt->skb->len, &priv->remain); +} + +static void nft_quota_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + if (nft_quota(priv, pkt) < 0 && !priv->invert) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { + [NFTA_QUOTA_BYTES] = { .type = NLA_U64 }, + [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, +}; + +static int nft_quota_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_quota *priv = nft_expr_priv(expr); + u32 flags = 0; + u64 quota; + + if (!tb[NFTA_QUOTA_BYTES]) + return -EINVAL; + + quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES])); + if (quota > S64_MAX) + return -EOVERFLOW; + + if (tb[NFTA_QUOTA_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS])); + if (flags & ~NFT_QUOTA_F_INV) + return -EINVAL; + } + + priv->quota = quota; + priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; + atomic64_set(&priv->remain, quota); + + return 0; +} + +static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_quota *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; + + if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), + NFTA_QUOTA_PAD) || + nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_quota_type; +static const struct nft_expr_ops nft_quota_ops = { + .type = &nft_quota_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)), + .eval = nft_quota_eval, + .init = nft_quota_init, + .dump = nft_quota_dump, +}; + +static struct nft_expr_type nft_quota_type __read_mostly = { + .name = "quota", + .ops = &nft_quota_ops, + .policy = nft_quota_policy, + .maxattr = NFTA_QUOTA_MAX, + .flags = NFT_EXPR_STATEFUL, + .owner = THIS_MODULE, +}; + +static int __init nft_quota_module_init(void) +{ + return nft_register_expr(&nft_quota_type); +} + +static void __exit nft_quota_module_exit(void) +{ + nft_unregister_expr(&nft_quota_type); +} + +module_init(nft_quota_module_init); +module_exit(nft_quota_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_EXPR("quota"); -- cgit v1.2.3 From 91dbc6be0a62d3bcea98287734d593610aed507d Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Thu, 18 Aug 2016 12:13:13 +0200 Subject: netfilter: nf_tables: add number generator expression This patch adds the numgen expression that allows us to generated incremental and random numbers, this generator is bound to a upper limit that is specified by userspace. This expression is useful to distribute packets in a round-robin fashion as well as randomly. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 24 ++++ net/netfilter/Kconfig | 6 + net/netfilter/Makefile | 1 + net/netfilter/nft_numgen.c | 192 +++++++++++++++++++++++++++++++ 4 files changed, 223 insertions(+) create mode 100644 net/netfilter/nft_numgen.c (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 784fbf15ab3d..8c9d6ff70ec0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1121,4 +1121,28 @@ enum nft_trace_types { __NFT_TRACETYPE_MAX }; #define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1) + +/** + * enum nft_ng_attributes - nf_tables number generator expression netlink attributes + * + * @NFTA_NG_DREG: destination register (NLA_U32) + * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32) + * @NFTA_NG_TYPE: operation type (NLA_U32) + */ +enum nft_ng_attributes { + NFTA_NG_UNSPEC, + NFTA_NG_DREG, + NFTA_NG_UNTIL, + NFTA_NG_TYPE, + __NFTA_NG_MAX +}; +#define NFTA_NG_MAX (__NFTA_NG_MAX - 1) + +enum nft_ng_types { + NFT_NG_INCREMENTAL, + NFT_NG_RANDOM, + __NFT_NG_MAX +}; +#define NFT_NG_MAX (__NFT_NG_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 29a8078deafa..e8d56d9a4df2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -474,6 +474,12 @@ config NFT_META This option adds the "meta" expression that you can use to match and to set packet metainformation such as the packet mark. +config NFT_NUMGEN + tristate "Netfilter nf_tables number generator module" + help + This option adds the number generator expression used to perform + incremental counting and random numbers bound to a upper limit. + config NFT_CT depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0fc42df19b8c..0c8581100ac6 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o +obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c new file mode 100644 index 000000000000..176e26d5bbd0 --- /dev/null +++ b/net/netfilter/nft_numgen.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016 Laura Garcia + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); + +struct nft_ng_inc { + enum nft_registers dreg:8; + u32 until; + atomic_t counter; +}; + +static void nft_ng_inc_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + u32 nval, oval; + + do { + oval = atomic_read(&priv->counter); + nval = (oval + 1 < priv->until) ? oval + 1 : 0; + } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); + + memcpy(®s->data[priv->dreg], &priv->counter, sizeof(u32)); +} + +static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { + [NFTA_NG_DREG] = { .type = NLA_U32 }, + [NFTA_NG_UNTIL] = { .type = NLA_U32 }, + [NFTA_NG_TYPE] = { .type = NLA_U32 }, +}; + +static int nft_ng_inc_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + + priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); + if (priv->until == 0) + return -ERANGE; + + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); + atomic_set(&priv->counter, 0); + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, + u32 until, enum nft_ng_types type) +{ + if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_NG_UNTIL, until)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_NG_TYPE, type)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL); +} + +struct nft_ng_random { + enum nft_registers dreg:8; + u32 until; +}; + +static void nft_ng_random_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); + + regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state), + priv->until); +} + +static int nft_ng_random_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + + priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); + if (priv->until == 0) + return -ERANGE; + + prandom_init_once(&nft_numgen_prandom_state); + + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ng_random *priv = nft_expr_priv(expr); + + return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM); +} + +static struct nft_expr_type nft_ng_type; +static const struct nft_expr_ops nft_ng_inc_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), + .eval = nft_ng_inc_eval, + .init = nft_ng_inc_init, + .dump = nft_ng_inc_dump, +}; + +static const struct nft_expr_ops nft_ng_random_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), + .eval = nft_ng_random_eval, + .init = nft_ng_random_init, + .dump = nft_ng_random_dump, +}; + +static const struct nft_expr_ops * +nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ + u32 type; + + if (!tb[NFTA_NG_DREG] || + !tb[NFTA_NG_UNTIL] || + !tb[NFTA_NG_TYPE]) + return ERR_PTR(-EINVAL); + + type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE])); + + switch (type) { + case NFT_NG_INCREMENTAL: + return &nft_ng_inc_ops; + case NFT_NG_RANDOM: + return &nft_ng_random_ops; + } + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ng_type __read_mostly = { + .name = "numgen", + .select_ops = &nft_ng_select_ops, + .policy = nft_ng_policy, + .maxattr = NFTA_NG_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_ng_module_init(void) +{ + return nft_register_expr(&nft_ng_type); +} + +static void __exit nft_ng_module_exit(void) +{ + nft_unregister_expr(&nft_ng_type); +} + +module_init(nft_ng_module_init); +module_exit(nft_ng_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia "); +MODULE_ALIAS_NFT_EXPR("numgen"); -- cgit v1.2.3 From b9a24bb76bf611a5268ceffe04219e6ad264559b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 19 Aug 2016 12:36:54 -0700 Subject: net_sched: properly handle failure case of tcf_exts_init() After commit 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array") we do dynamic allocation in tcf_exts_init(), therefore we need to handle the ENOMEM case properly. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 ++-- net/sched/cls_basic.c | 12 +++++-- net/sched/cls_bpf.c | 27 +++++++++------ net/sched/cls_cgroup.c | 13 +++++-- net/sched/cls_flow.c | 26 ++++++++------ net/sched/cls_flower.c | 11 ++++-- net/sched/cls_fw.c | 18 +++++++--- net/sched/cls_route.c | 14 +++++--- net/sched/cls_rsvp.h | 17 +++++++--- net/sched/cls_tcindex.c | 90 +++++++++++++++++++++++++++++++++++-------------- net/sched/cls_u32.c | 21 ++++++++---- 11 files changed, 181 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c99508d426cc..a459be5fe1c2 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -69,17 +69,19 @@ struct tcf_exts { int police; }; -static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police) +static inline int tcf_exts_init(struct tcf_exts *exts, int action, int police) { #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), GFP_KERNEL); - WARN_ON(!exts->actions); /* TODO: propagate the error to callers */ + if (!exts->actions) + return -ENOMEM; #endif exts->action = action; exts->police = police; + return 0; } /** diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 0b8c3ace671f..eb219b78cd49 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -138,10 +138,12 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; struct tcf_ematch_tree t; - tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) @@ -189,7 +191,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + if (err < 0) + goto errout; + err = -EINVAL; if (handle) { fnew->handle = handle; @@ -226,6 +231,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c3002c2c68bb..4742f415ee5b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -311,17 +311,19 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) return ret; + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + goto errout; if (tb[TCA_BPF_FLAGS]) { u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { - tcf_exts_destroy(&exts); - return -EINVAL; + ret = -EINVAL; + goto errout; } have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; @@ -331,10 +333,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); - if (ret < 0) { - tcf_exts_destroy(&exts); - return ret; - } + if (ret < 0) + goto errout; if (tb[TCA_BPF_CLASSID]) { prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); @@ -343,6 +343,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &prog->exts, &exts); return 0; + +errout: + tcf_exts_destroy(&exts); + return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -388,7 +392,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!prog) return -ENOBUFS; - tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (ret < 0) + goto errout; if (oldprog) { if (handle && oldprog->handle != handle) { @@ -420,9 +426,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) prog; return 0; + errout: + tcf_exts_destroy(&prog->exts); kfree(prog); - return ret; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 4c85bd3a750c..85233c470035 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -93,7 +93,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + if (err < 0) + goto errout; new->handle = handle; new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], @@ -101,10 +103,14 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); if (err < 0) goto errout; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) { + tcf_exts_destroy(&e); + goto errout; + } err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); if (err < 0) { @@ -120,6 +126,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; errout: + tcf_exts_destroy(&new->exts); kfree(new); return err; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index fbfec6a18839..2c1ae549edbf 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -418,10 +418,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if (err < 0) + goto err1; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) - return err; + goto err1; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); if (err < 0) @@ -432,13 +434,15 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (!fnew) goto err2; - tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if (err < 0) + goto err3; fold = (struct flow_filter *)*arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) - goto err2; + goto err3; /* Copy fold into fnew */ fnew->tp = fold->tp; @@ -458,31 +462,31 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (mode == FLOW_MODE_HASH) perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) - goto err2; + goto err3; if (!tb[TCA_FLOW_KEYS]) - goto err2; + goto err3; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } @@ -542,6 +546,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, call_rcu(&fold->rcu, flow_destroy_filter); return 0; +err3: + tcf_exts_destroy(&fnew->exts); err2: tcf_em_tree_destroy(&t); kfree(fnew); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1e11e57e6947..532ab6751343 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -513,10 +513,12 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; int err; - tcf_exts_init(&e, TCA_FLOWER_ACT, 0); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_FLOWER_ACT, 0); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); @@ -585,7 +587,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + if (err < 0) + goto errout; if (!handle) { handle = fl_grab_new_handle(tp, head); @@ -649,6 +653,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index f23a3b68bba6..cc0bda945800 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -195,10 +195,12 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, u32 mask; int err; - tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); @@ -270,10 +272,15 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; - tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) { + kfree(fnew); + return err; + } err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); if (err < 0) { + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } @@ -313,7 +320,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; - tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) + goto errout; f->id = handle; f->tp = tp; @@ -328,6 +337,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 08a3b0a6f5ab..c91e65d81a48 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -383,17 +383,19 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *est, int new, bool ovr) { - int err; u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; unsigned int h1; struct route4_bucket *b; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_ROUTE4_TO]) { @@ -503,7 +505,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (!f) goto errout; - tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + if (err < 0) + goto errout; + if (fold) { f->id = fold->id; f->iif = fold->iif; @@ -557,6 +562,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index f9c9fc075fe6..4f05a19fb073 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -487,10 +487,12 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout2; f = (struct rsvp_filter *)*arg; if (f) { @@ -506,7 +508,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout2; } - tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) { + kfree(n); + goto errout2; + } if (tb[TCA_RSVP_CLASSID]) { n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); @@ -530,7 +536,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; - tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) + goto errout; h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -627,6 +635,7 @@ insert: goto insert; errout: + tcf_exts_destroy(&f->exts); kfree(f); errout2: tcf_exts_destroy(&e); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 944c8ff45055..d9500709831f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -219,10 +219,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; -static void tcindex_filter_result_init(struct tcindex_filter_result *r) +static int tcindex_filter_result_init(struct tcindex_filter_result *r) { memset(r, 0, sizeof(*r)); - tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } static void __tcindex_partial_destroy(struct rcu_head *head) @@ -233,23 +233,57 @@ static void __tcindex_partial_destroy(struct rcu_head *head) kfree(p); } +static void tcindex_free_perfect_hash(struct tcindex_data *cp) +{ + int i; + + for (i = 0; i < cp->hash; i++) + tcf_exts_destroy(&cp->perfect[i].exts); + kfree(cp->perfect); +} + +static int tcindex_alloc_perfect_hash(struct tcindex_data *cp) +{ + int i, err = 0; + + cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), + GFP_KERNEL); + if (!cp->perfect) + return -ENOMEM; + + for (i = 0; i < cp->hash; i++) { + err = tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + if (err < 0) + goto errout; + } + + return 0; + +errout: + tcindex_free_perfect_hash(cp); + return err; +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, struct nlattr *est, bool ovr) { - int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data *cp, *oldp; + struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ + int err, balloc = 0; struct tcf_exts e; - tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -ENOMEM; /* tcindex_data attributes must look atomic to classifier/lookup so @@ -270,19 +304,20 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (p->perfect) { int i; - cp->perfect = kmemdup(p->perfect, - sizeof(*r) * cp->hash, GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout; for (i = 0; i < cp->hash; i++) - tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + cp->perfect[i].res = p->perfect[i].res; balloc = 1; } cp->h = p->h; - tcindex_filter_result_init(&new_filter_result); - tcindex_filter_result_init(&cr); + err = tcindex_filter_result_init(&new_filter_result); + if (err < 0) + goto errout1; + err = tcindex_filter_result_init(&cr); + if (err < 0) + goto errout1; if (old_r) cr.res = r->res; @@ -338,15 +373,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp->perfect && !cp->h) { if (valid_perfect_hash(cp)) { - int i; - - cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout_alloc; - for (i = 0; i < cp->hash; i++) - tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); balloc = 1; } else { struct tcindex_filter __rcu **hash; @@ -373,8 +401,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (!f) goto errout_alloc; f->key = handle; - tcindex_filter_result_init(&f->result); f->next = NULL; + err = tcindex_filter_result_init(&f->result); + if (err < 0) { + kfree(f); + goto errout_alloc; + } } if (tb[TCA_TCINDEX_CLASSID]) { @@ -387,8 +419,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - if (old_r && old_r != r) - tcindex_filter_result_init(old_r); + if (old_r && old_r != r) { + err = tcindex_filter_result_init(old_r); + if (err < 0) { + kfree(f); + goto errout_alloc; + } + } oldp = p; r->res = cr.res; @@ -415,9 +452,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, errout_alloc: if (balloc == 1) - kfree(cp->perfect); + tcindex_free_perfect_hash(cp); else if (balloc == 2) kfree(cp->h); +errout1: + tcf_exts_destroy(&cr.exts); + tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); tcf_exts_destroy(&e); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ffe593efe930..a29263a9d8c1 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -709,13 +709,15 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, bool ovr) { - int err; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_U32_LINK]) { @@ -833,7 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, new->tp = tp; memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { + kfree(new); + return NULL; + } return new; } @@ -985,9 +990,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; + err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (err < 0) + goto errout; + #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); if (!n->pcpu_success) { @@ -1028,9 +1036,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, errhw: #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); -errout: #endif +errout: + tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); #endif -- cgit v1.2.3 From d8c2c7e3404e5bcaeae4af78d6935e5b8fcc97ee Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 22 Aug 2016 13:25:11 +0300 Subject: qed*: Add support for VFs over legacy PFs Modern VFs can't run on old non-compatible as the fastpath HSI is slightly changed - but as the HSI is actually very close [basically, a single bit whose meaning flipped] this can be supported with small modifications. The major differences would be in: - Recognizing that VF is running on top of a legacy PF. - Returning some slowpath configurations that are no longer needed on top of modern PFs, but would be required when working over the legacy ones. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_l2.c | 2 + drivers/net/ethernet/qlogic/qed/qed_vf.c | 107 ++++++++++++++++++++++----- drivers/net/ethernet/qlogic/qed/qed_vf.h | 5 ++ drivers/net/ethernet/qlogic/qede/qede.h | 2 + drivers/net/ethernet/qlogic/qede/qede_main.c | 10 +++ include/linux/qed/qed_eth_if.h | 3 + 6 files changed, 109 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index bf433016551a..4409ea3f7d40 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1685,6 +1685,8 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev, qed_vf_get_num_vlan_filters(&cdev->hwfns[0], &info->num_vlan_filters); qed_vf_get_port_mac(&cdev->hwfns[0], info->port_mac); + + info->is_legacy = !!cdev->hwfns[0].vf_iov_info->b_pre_fp_hsi; } qed_fill_dev_info(cdev, &info->common); diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 9b780b31b15c..f9f68da8b277 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -191,6 +191,9 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) DP_VERBOSE(p_hwfn, QED_MSG_IOV, "attempting to acquire resources\n"); + /* Clear response buffer, as this might be a re-send */ + memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs)); + /* send acquire request */ rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); if (rc) @@ -205,9 +208,12 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) /* PF agrees to allocate our resources */ if (!(resp->pfdev_info.capabilities & PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE)) { - DP_INFO(p_hwfn, - "PF is using old incompatible driver; Either downgrade driver or request provider to update hypervisor version\n"); - return -EINVAL; + /* It's possible legacy PF mistakenly accepted; + * but we don't care - simply mark it as + * legacy and continue. + */ + req->vfdev_info.capabilities |= + VFPF_ACQUIRE_CAP_PRE_FP_HSI; } DP_VERBOSE(p_hwfn, QED_MSG_IOV, "resources acquired\n"); resources_acquired = true; @@ -215,27 +221,55 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) attempts < VF_ACQUIRE_THRESH) { qed_vf_pf_acquire_reduce_resc(p_hwfn, p_resc, &resp->resc); + } else if (resp->hdr.status == PFVF_STATUS_NOT_SUPPORTED) { + if (pfdev_info->major_fp_hsi && + (pfdev_info->major_fp_hsi != ETH_HSI_VER_MAJOR)) { + DP_NOTICE(p_hwfn, + "PF uses an incompatible fastpath HSI %02x.%02x [VF requires %02x.%02x]. Please change to a VF driver using %02x.xx.\n", + pfdev_info->major_fp_hsi, + pfdev_info->minor_fp_hsi, + ETH_HSI_VER_MAJOR, + ETH_HSI_VER_MINOR, + pfdev_info->major_fp_hsi); + rc = -EINVAL; + goto exit; + } - /* Clear response buffer */ - memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs)); - } else if ((resp->hdr.status == PFVF_STATUS_NOT_SUPPORTED) && - pfdev_info->major_fp_hsi && - (pfdev_info->major_fp_hsi != ETH_HSI_VER_MAJOR)) { - DP_NOTICE(p_hwfn, - "PF uses an incompatible fastpath HSI %02x.%02x [VF requires %02x.%02x]. Please change to a VF driver using %02x.xx.\n", - pfdev_info->major_fp_hsi, - pfdev_info->minor_fp_hsi, - ETH_HSI_VER_MAJOR, - ETH_HSI_VER_MINOR, pfdev_info->major_fp_hsi); - return -EINVAL; + if (!pfdev_info->major_fp_hsi) { + if (req->vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_PRE_FP_HSI) { + DP_NOTICE(p_hwfn, + "PF uses very old drivers. Please change to a VF driver using no later than 8.8.x.x.\n"); + rc = -EINVAL; + goto exit; + } else { + DP_INFO(p_hwfn, + "PF is old - try re-acquire to see if it supports FW-version override\n"); + req->vfdev_info.capabilities |= + VFPF_ACQUIRE_CAP_PRE_FP_HSI; + continue; + } + } + + /* If PF/VF are using same Major, PF must have had + * it's reasons. Simply fail. + */ + DP_NOTICE(p_hwfn, "PF rejected acquisition by VF\n"); + rc = -EINVAL; + goto exit; } else { DP_ERR(p_hwfn, "PF returned error %d to VF acquisition request\n", resp->hdr.status); - return -EAGAIN; + rc = -EAGAIN; + goto exit; } } + /* Mark the PF as legacy, if needed */ + if (req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_PRE_FP_HSI) + p_iov->b_pre_fp_hsi = true; + /* Update bulletin board size with response from PF */ p_iov->bulletin.size = resp->bulletin_size; @@ -253,14 +287,16 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) } } - if (ETH_HSI_VER_MINOR && + if (!p_iov->b_pre_fp_hsi && + ETH_HSI_VER_MINOR && (resp->pfdev_info.minor_fp_hsi < ETH_HSI_VER_MINOR)) { DP_INFO(p_hwfn, "PF is using older fastpath HSI; %02x.%02x is configured\n", ETH_HSI_VER_MAJOR, resp->pfdev_info.minor_fp_hsi); } - return 0; +exit: + return rc; } int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) @@ -347,6 +383,9 @@ free_p_iov: return -ENOMEM; } +#define TSTORM_QZONE_START PXP_VF_BAR0_START_SDM_ZONE_A +#define MSTORM_QZONE_START(dev) (TSTORM_QZONE_START + \ + (TSTORM_QZONE_SIZE * NUM_OF_L2_QUEUES(dev))) int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, u8 rx_qid, @@ -374,6 +413,21 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, req->bd_max_bytes = bd_max_bytes; req->stat_id = -1; + /* If PF is legacy, we'll need to calculate producers ourselves + * as well as clean them. + */ + if (pp_prod && p_iov->b_pre_fp_hsi) { + u8 hw_qid = p_iov->acquire_resp.resc.hw_qid[rx_qid]; + u32 init_prod_val = 0; + + *pp_prod = (u8 __iomem *)p_hwfn->regview + + MSTORM_QZONE_START(p_hwfn->cdev) + + hw_qid * MSTORM_QZONE_SIZE; + + /* Init the rcq, rx bd and rx sge (if valid) producers to 0 */ + __internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32), + (u32 *)(&init_prod_val)); + } /* add list termination tlv */ qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); @@ -387,7 +441,7 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, return -EINVAL; /* Learn the address of the producer from the response */ - if (pp_prod) { + if (pp_prod && !p_iov->b_pre_fp_hsi) { u32 init_prod_val = 0; *pp_prod = (u8 __iomem *)p_hwfn->regview + resp->offset; @@ -470,7 +524,20 @@ int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn, } if (pp_doorbell) { - *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + resp->offset; + /* Modern PFs provide the actual offsets, while legacy + * provided only the queue id. + */ + if (!p_iov->b_pre_fp_hsi) { + *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + + resp->offset; + } else { + u8 cid = p_iov->acquire_resp.resc.cid[tx_queue_id]; + u32 db_addr; + + db_addr = qed_db_addr(cid, DQ_DEMS_LEGACY); + *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + + db_addr; + } DP_VERBOSE(p_hwfn, QED_MSG_IOV, "Txq[0x%02x]: doorbell at %p [offset 0x%08x]\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 60a599b579bd..35db7a28aa13 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -551,6 +551,11 @@ struct qed_vf_iov { /* we set aside a copy of the acquire response */ struct pfvf_acquire_resp_tlv acquire_resp; + + /* In case PF originates prior to the fp-hsi version comparison, + * this has to be propagated as it affects the fastpath. + */ + bool b_pre_fp_hsi; }; #ifdef CONFIG_QED_SRIOV diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 32325ca5951f..700b509f7143 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -268,6 +268,8 @@ struct qede_tx_queue { u16 num_tx_buffers; u64 xmit_pkts; u64 stopped_cnt; + + bool is_legacy; }; #define BD_UNMAP_ADDR(bd) HILO_U64(le32_to_cpu((bd)->addr.hi), \ diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index a05459f96962..ac126e6067ae 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -598,6 +598,14 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb, 1 << ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT; } + /* Legacy FW had flipped behavior in regard to this bit - + * I.e., needed to set to prevent FW from touching encapsulated + * packets when it didn't need to. + */ + if (unlikely(txq->is_legacy)) + first_bd->data.bitfields ^= + 1 << ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT; + /* If the packet is IPv6 with extension header, indicate that * to FW and pass few params, since the device cracker doesn't * support parsing IPv6 with extension header/s. @@ -2991,6 +2999,8 @@ static void qede_init_fp(struct qede_dev *edev) for (tc = 0; tc < edev->num_tc; tc++) { txq_index = tc * QEDE_RSS_CNT(edev) + rss_id; fp->txqs[tc].index = txq_index; + if (edev->dev_info.is_legacy) + fp->txqs[tc].is_legacy = true; } snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 4475a9d8ae15..33c24ebc9b7f 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -23,6 +23,9 @@ struct qed_dev_eth_info { u8 port_mac[ETH_ALEN]; u8 num_vlan_filters; + + /* Legacy VF - this affects the datapath, so qede has to know */ + bool is_legacy; }; struct qed_update_vport_rss_params { -- cgit v1.2.3 From 7b314362a2344feaafbdf6aa8f3d57077728e37a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 22 Aug 2016 16:01:01 +0200 Subject: net: dsa: Allow the DSA driver to indicate the tag protocol DSA drivers may drive different families of switches which need different tag protocol. Rather than hard code the tag protocol in the driver structure, have a callback for the DSA core to call. Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 7 ++++++- drivers/net/dsa/bcm_sf2.c | 7 ++++++- drivers/net/dsa/mv88e6060.c | 7 ++++++- drivers/net/dsa/mv88e6xxx/chip.c | 7 ++++++- include/net/dsa.h | 5 +++-- net/dsa/dsa.c | 5 ++++- net/dsa/dsa2.c | 4 +++- 7 files changed, 34 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 38ee10de7884..65ecb51f99e5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1373,8 +1373,13 @@ static void b53_br_set_stp_state(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } +static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds) +{ + return DSA_TAG_PROTO_NONE; +} + static struct dsa_switch_driver b53_switch_ops = { - .tag_protocol = DSA_TAG_PROTO_NONE, + .get_tag_protocol = b53_get_tag_protocol, .setup = b53_setup, .set_addr = b53_set_addr, .get_strings = b53_get_strings, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 8e6fe13dbec3..b47a74b37a42 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -136,6 +136,11 @@ static int bcm_sf2_sw_get_sset_count(struct dsa_switch *ds) return BCM_SF2_STATS_SIZE; } +static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds) +{ + return DSA_TAG_PROTO_BRCM; +} + static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) { struct bcm_sf2_priv *priv = ds_to_priv(ds); @@ -1577,8 +1582,8 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) } static struct dsa_switch_driver bcm_sf2_switch_driver = { - .tag_protocol = DSA_TAG_PROTO_BRCM, .setup = bcm_sf2_sw_setup, + .get_tag_protocol = bcm_sf2_sw_get_tag_protocol, .set_addr = bcm_sf2_sw_set_addr, .get_phy_flags = bcm_sf2_sw_get_phy_flags, .get_strings = bcm_sf2_sw_get_strings, diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index e36b40886bd8..1fdfbf3a50bc 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -69,6 +69,11 @@ static const char *mv88e6060_get_name(struct mii_bus *bus, int sw_addr) return NULL; } +static enum dsa_tag_protocol mv88e6060_get_tag_protocol(struct dsa_switch *ds) +{ + return DSA_TAG_PROTO_TRAILER; +} + static const char *mv88e6060_drv_probe(struct device *dsa_dev, struct device *host_dev, int sw_addr, void **_priv) @@ -248,7 +253,7 @@ mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) } static struct dsa_switch_driver mv88e6060_switch_driver = { - .tag_protocol = DSA_TAG_PROTO_TRAILER, + .get_tag_protocol = mv88e6060_get_tag_protocol, .probe = mv88e6060_drv_probe, .setup = mv88e6060_setup, .set_addr = mv88e6060_set_addr, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 014b52bd72f1..63cad6c00bc7 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3924,6 +3924,11 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip, return 0; } +static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds) +{ + return DSA_TAG_PROTO_EDSA; +} + static const char *mv88e6xxx_drv_probe(struct device *dsa_dev, struct device *host_dev, int sw_addr, void **priv) @@ -3967,8 +3972,8 @@ free: } static struct dsa_switch_driver mv88e6xxx_switch_driver = { - .tag_protocol = DSA_TAG_PROTO_EDSA, .probe = mv88e6xxx_drv_probe, + .get_tag_protocol = mv88e6xxx_get_tag_protocol, .setup = mv88e6xxx_setup, .set_addr = mv88e6xxx_set_addr, .adjust_link = mv88e6xxx_adjust_link, diff --git a/include/net/dsa.h b/include/net/dsa.h index d00c392bc9f8..8ca2684c5358 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -239,14 +239,15 @@ struct switchdev_obj_port_vlan; struct dsa_switch_driver { struct list_head list; - enum dsa_tag_protocol tag_protocol; - /* * Probing and setup. */ const char *(*probe)(struct device *dsa_dev, struct device *host_dev, int sw_addr, void **priv); + + enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); + int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 8bda74e595a5..8d3a28d4e99d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -354,7 +354,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + enum dsa_tag_protocol tag_protocol; + + tag_protocol = drv->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { ret = PTR_ERR(dst->tag_ops); goto out; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f30bad9678f0..2e343221464c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -443,6 +443,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -465,7 +466,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, dst->cpu_port = index; } - dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + tag_protocol = ds->drv->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); return PTR_ERR(dst->tag_ops); -- cgit v1.2.3 From f1ff8666ed87b0013e45ce2d335085407bb38a60 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Tue, 23 Aug 2016 07:19:50 +0300 Subject: qed: Fix address macros Last FW submission reverted various macros into an older form, where they generate compilation warnings on some architectures. Bring back the newer macros instead. Fixes: 05fafbfb3d77 ("qed: utilize FW 8.10.10.0") Reported-by: kbuild test robot Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index d306e0b55581..70b30e4d3cc4 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -13,23 +13,17 @@ #include /* dma_addr_t manip */ -#define DMA_LO(x) ((u32)(((dma_addr_t)(x)) & 0xffffffff)) -#define DMA_HI(x) ((u32)(((dma_addr_t)(x)) >> 32)) - -#define DMA_LO_LE(x) cpu_to_le32(DMA_LO(x)) -#define DMA_HI_LE(x) cpu_to_le32(DMA_HI(x)) - -/* It's assumed that whoever includes this has previously included an hsi - * file defining the regpair. - */ -#define DMA_REGPAIR_LE(x, val) (x).hi = DMA_HI_LE((val)); \ - (x).lo = DMA_LO_LE((val)) +#define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) +#define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) +#define DMA_REGPAIR_LE(x, val) do { \ + (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)); \ + } while (0) #define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) -#define HILO_DMA(hi, lo) HILO_GEN(hi, lo, dma_addr_t) -#define HILO_64(hi, lo) HILO_GEN(hi, lo, u64) -#define HILO_DMA_REGPAIR(regpair) (HILO_DMA(regpair.hi, regpair.lo)) +#define HILO_64(hi, lo) HILO_GEN((le32_to_cpu(hi)), (le32_to_cpu(lo)), u64) #define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) +#define HILO_DMA_REGPAIR(regpair) ((dma_addr_t)HILO_64_REGPAIR(regpair)) #ifndef __COMMON_HSI__ #define __COMMON_HSI__ -- cgit v1.2.3 From b43f95695124baba9f7c48e247fc8fd212a984d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Mon, 22 Aug 2016 11:37:36 +0200 Subject: netfilter: nf_tables: typo in trace attribute definition Should be attributes, instead of attibutes, for consistency with other definitions. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8c9d6ff70ec0..8a63f22b365d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1090,7 +1090,7 @@ enum nft_gen_attributes { * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) */ -enum nft_trace_attibutes { +enum nft_trace_attributes { NFTA_TRACE_UNSPEC, NFTA_TRACE_TABLE, NFTA_TRACE_CHAIN, -- cgit v1.2.3 From df844fd46b98c2efde8f4ac2d50d59bc90c4c679 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Aug 2016 15:27:24 +0100 Subject: rxrpc: Use a tracepoint for skb accounting debugging Use a tracepoint to log various skb accounting points to help in debugging refcounting errors. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 56 +++++++++++++++++++++++++++++++++++++++ net/rxrpc/af_rxrpc.c | 1 + net/rxrpc/ar-internal.h | 45 +++++--------------------------- net/rxrpc/call_accept.c | 1 + net/rxrpc/call_event.c | 3 +++ net/rxrpc/conn_event.c | 2 ++ net/rxrpc/local_event.c | 1 + net/rxrpc/output.c | 1 + net/rxrpc/recvmsg.c | 1 + net/rxrpc/skbuff.c | 62 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 135 insertions(+), 38 deletions(-) create mode 100644 include/trace/events/rxrpc.h (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h new file mode 100644 index 000000000000..15283ee3e41a --- /dev/null +++ b/include/trace/events/rxrpc.h @@ -0,0 +1,56 @@ +/* AF_RXRPC tracepoints + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rxrpc + +#if !defined(_TRACE_RXRPC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RXRPC_H + +#include + +TRACE_EVENT(rxrpc_skb, + TP_PROTO(struct sk_buff *skb, int op, int usage, int mod_count, + const void *where), + + TP_ARGS(skb, op, usage, mod_count, where), + + TP_STRUCT__entry( + __field(struct sk_buff *, skb ) + __field(int, op ) + __field(int, usage ) + __field(int, mod_count ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->skb = skb; + __entry->op = op; + __entry->usage = usage; + __entry->mod_count = mod_count; + __entry->where = where; + ), + + TP_printk("s=%p %s u=%d m=%d p=%pSR", + __entry->skb, + (__entry->op == 0 ? "NEW" : + __entry->op == 1 ? "SEE" : + __entry->op == 2 ? "GET" : + __entry->op == 3 ? "FRE" : + "PUR"), + __entry->usage, + __entry->mod_count, + __entry->where) + ); + +#endif /* _TRACE_RXRPC_H */ + +/* This part must be outside protection */ +#include diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 88effadd4b16..c7cf356b42b8 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -22,6 +22,7 @@ #include #include #include +#define CREATE_TRACE_POINTS #include "ar-internal.h" MODULE_DESCRIPTION("RxRPC network protocol"); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 648060a5df35..8cb517fbbd23 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -479,6 +479,8 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) write_unlock_bh(&call->state_lock); } +#include + /* * af_rxrpc.c */ @@ -752,6 +754,11 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *); * skbuff.c */ void rxrpc_packet_destructor(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *); +void rxrpc_see_skb(struct sk_buff *); +void rxrpc_get_skb(struct sk_buff *); +void rxrpc_free_skb(struct sk_buff *); +void rxrpc_purge_queue(struct sk_buff_head *); /* * sysctl.c @@ -899,44 +906,6 @@ do { \ #endif /* __KDEBUGALL */ -/* - * socket buffer accounting / leak finding - */ -static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn) -{ - //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_inc(&rxrpc_n_skbs); -} - -#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__) - -static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn) -{ - //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); -} - -#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__) - -static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn) -{ - if (skb) { - CHECK_SLAB_OKAY(&skb->users); - //_net("free skb %p %s [%d]", - // skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); - kfree_skb(skb); - } -} - -#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__) - -static inline void rxrpc_purge_queue(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = skb_dequeue((list))) != NULL) - rxrpc_free_skb(skb); -} #define rxrpc_get_call(CALL) \ do { \ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9bae21e66d65..669ac79d3b44 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -203,6 +203,7 @@ void rxrpc_accept_incoming_calls(struct rxrpc_local *local) _net("incoming call skb %p", skb); + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); /* Set up a response packet header in case we need it */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index eaa8035dcb71..3d1267cea9ea 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -407,6 +407,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) skb = skb_dequeue(&call->rx_oos_queue); if (skb) { + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); _debug("drain OOS packet %d [%d]", @@ -427,6 +428,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) /* find out what the next packet is */ skb = skb_peek(&call->rx_oos_queue); + rxrpc_see_skb(skb); if (skb) call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; else @@ -576,6 +578,7 @@ process_further: if (!skb) return -EAGAIN; + rxrpc_see_skb(skb); _net("deferred skb %p", skb); sp = rxrpc_skb(skb); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index cee0f35bc1cf..c631d926f4db 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -277,6 +277,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { + rxrpc_see_skb(skb); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -365,6 +366,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); switch (sa.sa.sa_family) { case AF_INET: diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 31a3f86ef2f6..bcc6593b4cdb 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -93,6 +93,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_see_skb(skb); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 9e626f1e2668..e3a08d542fb7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -548,6 +548,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; + rxrpc_see_skb(skb); copied = 0; do { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9ed66d533002..b964c2d49a88 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -111,6 +111,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } peek_next_packet: + rxrpc_see_skb(skb); sp = rxrpc_skb(skb); call = sp->call; ASSERT(call != NULL); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 06c51d4b622d..d28058a97bc1 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -163,3 +163,65 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb) rxrpc_free_skb(skb); } EXPORT_SYMBOL(rxrpc_kernel_free_skb); + +/* + * Note the existence of a new-to-us socket buffer (allocated or dequeued). + */ +void rxrpc_new_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); +} + +/* + * Note the re-emergence of a socket buffer from a queue or buffer. + */ +void rxrpc_see_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n = atomic_read(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + } +} + +/* + * Note the addition of a ref on a socket buffer. + */ +void rxrpc_get_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + skb_get(skb); +} + +/* + * Note the destruction of a socket buffer. + */ +void rxrpc_free_skb(struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + n = atomic_dec_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } +} + +/* + * Clear a queue of socket buffers. + */ +void rxrpc_purge_queue(struct sk_buff_head *list) +{ + const void *here = __builtin_return_address(0); + struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) { + int n = atomic_dec_return(&rxrpc_n_skbs); + trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } +} -- cgit v1.2.3 From 30d1de08c87ddde6f73936c3350e7e153988fe02 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2016 12:17:48 -0700 Subject: hv_netvsc: make inline functions static Several new functions were introduced into hyperv.h but only used in one file. Move them and let compiler decide on inline. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 85 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/hyperv.h | 84 -------------------------------------------- 2 files changed, 84 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index fe83de33895d..4ef2af63419f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -33,6 +33,89 @@ #include "hyperv_net.h" +/* + * An API to support in-place processing of incoming VMBUS packets. + */ +#define VMBUS_PKT_TRAILER 8 + +static struct vmpacket_descriptor * +get_next_pkt_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + void *ring_buffer = hv_get_ring_buffer(ring_info); + struct vmpacket_descriptor *cur_desc; + u32 packetlen; + u32 dsize = ring_info->ring_datasize; + u32 delta = read_loc - ring_info->ring_buffer->read_index; + u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); + + if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) + return NULL; + + if ((read_loc + sizeof(*cur_desc)) > dsize) + return NULL; + + cur_desc = ring_buffer + read_loc; + packetlen = cur_desc->len8 << 3; + + /* + * If the packet under consideration is wrapping around, + * return failure. + */ + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) + return NULL; + + return cur_desc; +} + +/* + * A helper function to step through packets "in-place" + * This API is to be called after each successful call + * get_next_pkt_raw(). + */ +static void put_pkt_raw(struct vmbus_channel *channel, + struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + u32 packetlen = desc->len8 << 3; + u32 dsize = ring_info->ring_datasize; + + BUG_ON((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize); + + /* + * Include the packet trailer. + */ + ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; +} + +/* + * This call commits the read index and potentially signals the host. + * Here is the pattern for using the "in-place" consumption APIs: + * + * while (get_next_pkt_raw() { + * process the packet "in-place"; + * put_pkt_raw(); + * } + * if (packets processed in place) + * commit_rd_index(); + */ +static void commit_rd_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + virt_rmb(); + ring_info->ring_buffer->read_index = ring_info->priv_read_index; + + if (hv_need_to_signal_on_read(ring_info)) + vmbus_set_event(channel); +} + /* * Switch the data path from the synthetic interface to the VF * interface. @@ -749,7 +832,7 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device, return msg_size; } -static inline int netvsc_send_pkt( +static int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, struct netvsc_device *net_device, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b10954a66939..a6bc974def8f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1422,88 +1422,4 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } -/* - * An API to support in-place processing of incoming VMBUS packets. - */ -#define VMBUS_PKT_TRAILER 8 - -static inline struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - struct vmpacket_descriptor *cur_desc; - u32 packetlen; - u32 dsize = ring_info->ring_datasize; - u32 delta = read_loc - ring_info->ring_buffer->read_index; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - if ((read_loc + sizeof(*cur_desc)) > dsize) - return NULL; - - cur_desc = ring_buffer + read_loc; - packetlen = cur_desc->len8 << 3; - - /* - * If the packet under consideration is wrapping around, - * return failure. - */ - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) - return NULL; - - return cur_desc; -} - -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static inline void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) - BUG(); - /* - * Include the packet trailer. - */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; -} - -/* - * This call commits the read index and potentially signals the host. - * Here is the pattern for using the "in-place" consumption APIs: - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); - */ -static inline void commit_rd_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; - - if (hv_need_to_signal_on_read(ring_info)) - vmbus_set_event(channel); -} - - #endif /* _HYPERV_H */ -- cgit v1.2.3 From e3f74b841d482e962b9f5a907eeb25eeeb09aa60 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2016 12:17:56 -0700 Subject: hv_netvsc: report vmbus name in ethtool Make netvsc on vmbus behave more like PCI. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ++++ include/linux/hyperv.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7ed9f13067b5..b874ab1317c6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -734,8 +734,12 @@ vf_injection_done: static void netvsc_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { + struct net_device_context *net_device_ctx = netdev_priv(net); + struct hv_device *dev = net_device_ctx->device_ctx; + strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strlcpy(info->bus_info, vmbus_dev_name(dev), sizeof(info->bus_info)); } static void netvsc_get_channels(struct net_device *net, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a6bc974def8f..b01c8c3dd531 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1114,6 +1114,13 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver, const char *mod_name); void vmbus_driver_unregister(struct hv_driver *hv_driver); +static inline const char *vmbus_dev_name(const struct hv_device *device_obj) +{ + const struct kobject *kobj = &device_obj->device.kobj; + + return kobj->name; +} + void vmbus_hvsock_device_unregister(struct vmbus_channel *channel); int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, -- cgit v1.2.3 From cff6a334e63420e95ec40dc7fcdc0b8258615760 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 23 Aug 2016 11:55:30 -0700 Subject: strparser: Queue work when being unpaused When the upper layer unpauses a stream parser connection we need to queue rx_work to make sure no events are missed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/strparser.h | 5 +---- net/strparser/strparser.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index fdb3d6746cc4..91fa0b958426 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -88,10 +88,7 @@ static inline void strp_pause(struct strparser *strp) } /* May be called without holding lock for attached socket */ -static inline void strp_unpause(struct strparser *strp) -{ - strp->rx_paused = 0; -} +void strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 68334b56db1e..4ecfc10cbe6d 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -445,6 +445,17 @@ int strp_init(struct strparser *strp, struct sock *csk, } EXPORT_SYMBOL_GPL(strp_init); +void strp_unpause(struct strparser *strp) +{ + strp->rx_paused = 0; + + /* Sync setting rx_paused with RX work */ + smp_mb(); + + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_unpause); + /* strp must already be stopped so that strp_tcp_recv will no longer be called. * Note that strp_done is not called with the lower socket held. */ -- cgit v1.2.3 From cebc5cbab48f5c58512e26aa1965284354227258 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 22 Aug 2016 17:17:54 -0700 Subject: net-tcp: retire TFO_SERVER_WO_SOCKOPT2 config TFO_SERVER_WO_SOCKOPT2 was intended for debugging purposes during Fast Open development. Remove this config option and also update/clean-up the documentation of the Fast Open sysctl. Reported-by: Piotr Jurkiewicz Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 45 +++++++++++++++++----------------- include/net/tcp.h | 3 +-- net/ipv4/af_inet.c | 21 ++++++---------- 3 files changed, 32 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9ae929395b24..3db8c67d2c8d 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -575,32 +575,33 @@ tcp_syncookies - BOOLEAN unconditionally generation of syncookies. tcp_fastopen - INTEGER - Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data - in the opening SYN packet. To use this feature, the client application - must use sendmsg() or sendto() with MSG_FASTOPEN flag rather than - connect() to perform a TCP handshake automatically. + Enable TCP Fast Open (RFC7413) to send and accept data in the opening + SYN packet. - The values (bitmap) are - 1: Enables sending data in the opening SYN on the client w/ MSG_FASTOPEN. - 2: Enables TCP Fast Open on the server side, i.e., allowing data in - a SYN packet to be accepted and passed to the application before - 3-way hand shake finishes. - 4: Send data in the opening SYN regardless of cookie availability and - without a cookie option. - 0x100: Accept SYN data w/o validating the cookie. - 0x200: Accept data-in-SYN w/o any cookie option present. - 0x400/0x800: Enable Fast Open on all listeners regardless of the - TCP_FASTOPEN socket option. The two different flags designate two - different ways of setting max_qlen without the TCP_FASTOPEN socket - option. + The client support is enabled by flag 0x1 (on by default). The client + then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, + rather than connect() to send data in SYN. - Default: 1 + The server support is enabled by flag 0x2 (off by default). Then + either enable for all listeners with another flag (0x400) or + enable individual listeners via TCP_FASTOPEN socket option with + the option value being the length of the syn-data backlog. - Note that the client & server side Fast Open flags (1 and 2 - respectively) must be also enabled before the rest of flags can take - effect. + The values (bitmap) are + 0x1: (client) enables sending data in the opening SYN on the client. + 0x2: (server) enables the server support, i.e., allowing data in + a SYN packet to be accepted and passed to the + application before 3-way handshake finishes. + 0x4: (client) send data in the opening SYN regardless of cookie + availability and without a cookie option. + 0x200: (server) accept data-in-SYN w/o any cookie option present. + 0x400: (server) enable all listeners to support Fast Open by + default without explicit TCP_FASTOPEN socket option. + + Default: 0x1 - See include/net/tcp.h and the code for more details. + Note that that additional client or server features are only + effective if the basic support (0x1 and 0x2) are enabled respectively. tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e7d51bb18..25d64f6de69e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -227,10 +227,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the - * TCP_FASTOPEN socket option. SOCKOPT1/2 determine how to set max_qlen. + * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 -#define TFO_SERVER_WO_SOCKOPT2 0x800 extern struct inet_timewait_death_row tcp_death_row; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55513e654d79..989a362814a9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -211,24 +211,19 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - /* Check special setups for testing purpose to enable TFO w/o - * requiring TCP_FASTOPEN sockopt. + /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. - * Also fastopenq may already been allocated because this - * socket was in TCP_LISTEN state previously but was - * shutdown() (rather than close()). + * Also fastopen backlog may already been set via the option + * because the socket was in TCP_LISTEN state previously but + * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && + if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - fastopen_queue_tune(sk, backlog); - else if ((sysctl_tcp_fastopen & - TFO_SERVER_WO_SOCKOPT2) != 0) - fastopen_queue_tune(sk, - ((uint)sysctl_tcp_fastopen) >> 16); - + fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); } + err = inet_csk_listen_start(sk, backlog); if (err) goto out; -- cgit v1.2.3 From 5d77dca82839ef016a93ad7acd7058b14d967752 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Aug 2016 21:06:33 -0700 Subject: net: diag: support SOCK_DESTROY for UDP sockets This implements SOCK_DESTROY for UDP sockets similar to what was done for TCP with commit c1e64e298b8ca ("net: diag: Support destroying TCP sockets.") A process with a UDP socket targeted for destroy is awakened and recvmsg fails with ECONNABORTED. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 15 ++++++++++ net/ipv4/udp_diag.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/udp.c | 1 + 4 files changed, 96 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 8894d7144189..ea53a87d880f 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -251,6 +251,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); void udp_err(struct sk_buff *, u32); +int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8f5f7f6026f7..e9ffc27b23d0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2193,6 +2193,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(udp_poll); +int udp_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(udp_abort); + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -2224,6 +2238,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif .clear_sk = sk_prot_clear_portaddr_nulls, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 3d5ccf4b1412..8a9f6e535caa 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -165,12 +165,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +268,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 81e2f98b958d..16512cf06e73 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1467,6 +1467,7 @@ struct proto udpv6_prot = { .compat_getsockopt = compat_udpv6_getsockopt, #endif .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { -- cgit v1.2.3 From 4cac8204661a6d1a842e47911933f1e90b392c84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Aug 2016 11:39:27 -0700 Subject: udp: get rid of sk_prot_clear_portaddr_nulls() Since we no longer use SLAB_DESTROY_BY_RCU for UDP, we do not need sk_prot_clear_portaddr_nulls() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 -- net/core/sock.c | 18 ------------------ net/ipv4/udp.c | 1 - net/ipv4/udplite.c | 1 - net/ipv6/udplite.c | 1 - 5 files changed, 23 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2aab9b63bf16..1bc57609f8e1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1242,8 +1242,6 @@ static inline int __sk_prot_rehash(struct sock *sk) return sk->sk_prot->hash(sk); } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); - /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..2b09c2967e21 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1315,24 +1315,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) -{ - unsigned long nulls1, nulls2; - - nulls1 = offsetof(struct sock, __sk_common.skc_node.next); - nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); - if (nulls1 > nulls2) - swap(nulls1, nulls2); - - if (nulls1 != 0) - memset((char *)sk, 0, nulls1); - memset((char *)sk + nulls1 + sizeof(void *), 0, - nulls2 - nulls1 - sizeof(void *)); - memset((char *)sk + nulls2 + sizeof(void *), 0, - size - nulls2 - sizeof(void *)); -} -EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e9ffc27b23d0..f0ebb0bd1e11 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2237,7 +2237,6 @@ struct proto udp_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 3b3efbda48e1..67fc9d96e67d 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -61,7 +61,6 @@ struct proto udplite_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, }; EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 118057a5b759..5cf0099b86f7 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -56,7 +56,6 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, }; static struct inet_protosw udplite6_protosw = { -- cgit v1.2.3 From ba2489b0e0113f68a25fe7a563842c2b591829d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Aug 2016 11:39:29 -0700 Subject: net: remove clear_sk() method We no longer use this handler, we can delete it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 1 - net/core/sock.c | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 1bc57609f8e1..c797c57f4d9f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1020,7 +1020,6 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); - void (*clear_sk)(struct sock *sk, int size); /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/net/core/sock.c b/net/core/sock.c index 2b09c2967e21..51a730485649 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1326,12 +1326,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) { - if (prot->clear_sk) - prot->clear_sk(sk, prot->obj_size); - else - sk_prot_clear_nulls(sk, prot->obj_size); - } + if (priority & __GFP_ZERO) + sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); -- cgit v1.2.3 From 35db57bbc4b7ab810bba6e6d6954a0faf5a842cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Aug 2016 16:00:12 +0200 Subject: xfrm: state: remove per-netns gc task After commit 5b8ef3415a21f173 ("xfrm: Remove ancient sleeping when the SA is in acquire state") gc does not need any per-netns data anymore. As far as gc is concerned all state structs are the same, so we can use a global work struct for it. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 -- net/xfrm/xfrm_state.c | 18 +++++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 177ed444d7b2..27bb9633c69d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -44,8 +44,6 @@ struct netns_xfrm { unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; - struct hlist_head state_gc_list; - struct work_struct state_gc_work; struct list_head policy_all; struct hlist_head *policy_byidx; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1a15b658a79e..ba8bf518ba14 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -31,6 +31,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +static void xfrm_state_gc_task(struct work_struct *work); + /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) @@ -41,6 +43,9 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); +static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); +static HLIST_HEAD(xfrm_state_gc_list); + static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { return atomic_inc_not_zero(&x->refcnt); @@ -368,13 +373,12 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) static void xfrm_state_gc_task(struct work_struct *work) { - struct net *net = container_of(work, struct net, xfrm.state_gc_work); struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); - hlist_move_list(&net->xfrm.state_gc_list, &gc_list); + hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); @@ -515,14 +519,12 @@ EXPORT_SYMBOL(xfrm_state_alloc); void __xfrm_state_destroy(struct xfrm_state *x) { - struct net *net = xs_net(x); - WARN_ON(x->km.state != XFRM_STATE_DEAD); spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&net->xfrm.state_gc_work); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -2134,8 +2136,6 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); - INIT_HLIST_HEAD(&net->xfrm.state_gc_list); - INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); spin_lock_init(&net->xfrm.xfrm_state_lock); return 0; @@ -2153,7 +2153,7 @@ void xfrm_state_fini(struct net *net) flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, IPSEC_PROTO_ANY, false); - flush_work(&net->xfrm.state_gc_work); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- cgit v1.2.3 From 9d490b4ee4d7d495a4f4908ea998d2a7355e0807 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Aug 2016 12:38:56 -0400 Subject: net: dsa: rename switch operations structure Now that the dsa_switch_driver structure contains only function pointers as it is supposed to, rename it to the more appropriate dsa_switch_ops, uniformly to any other operations structure in the kernel. No functional changes here, basically just the result of something like: s/dsa_switch_driver *drv/dsa_switch_ops *ops/g However keep the {un,}register_switch_driver functions and their dsa_switch_drivers list as is, since they represent the -- likely to be deprecated soon -- legacy DSA registration framework. In the meantime, also fix the following checks from checkpatch.pl to make it happy with this patch: CHECK: Comparison to NULL could be written "!ops" #403: FILE: net/dsa/dsa.c:470: + if (ops == NULL) { CHECK: Comparison to NULL could be written "ds->ops->get_strings" #773: FILE: net/dsa/slave.c:697: + if (ds->ops->get_strings != NULL) CHECK: Comparison to NULL could be written "ds->ops->get_ethtool_stats" #824: FILE: net/dsa/slave.c:785: + if (ds->ops->get_ethtool_stats != NULL) CHECK: Comparison to NULL could be written "ds->ops->get_sset_count" #835: FILE: net/dsa/slave.c:798: + if (ds->ops->get_sset_count != NULL) total: 0 errors, 0 warnings, 4 checks, 784 lines checked Signed-off-by: Vivien Didelot Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.txt | 10 +-- drivers/net/dsa/b53/b53_common.c | 4 +- drivers/net/dsa/bcm_sf2.c | 4 +- drivers/net/dsa/mv88e6060.c | 6 +- drivers/net/dsa/mv88e6xxx/chip.c | 8 +- include/net/dsa.h | 10 +-- net/dsa/dsa.c | 70 ++++++++--------- net/dsa/dsa2.c | 16 ++-- net/dsa/slave.c | 146 +++++++++++++++++------------------ 9 files changed, 137 insertions(+), 137 deletions(-) (limited to 'include') diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index 9d05ed7f7da5..44ed453ccf66 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -227,9 +227,9 @@ to address individual switches in the tree. dsa_switch: structure describing a switch device in the tree, referencing a dsa_switch_tree as a backpointer, slave network devices, master network device, -and a reference to the backing dsa_switch_driver +and a reference to the backing dsa_switch_ops -dsa_switch_driver: structure referencing function pointers, see below for a full +dsa_switch_ops: structure referencing function pointers, see below for a full description. Design limitations @@ -357,10 +357,10 @@ regular HWMON devices in /sys/class/hwmon/. Driver development ================== -DSA switch drivers need to implement a dsa_switch_driver structure which will +DSA switch drivers need to implement a dsa_switch_ops structure which will contain the various members described below. -register_switch_driver() registers this dsa_switch_driver in its internal list +register_switch_driver() registers this dsa_switch_ops in its internal list of drivers to probe for. unregister_switch_driver() does the exact opposite. Unless requested differently by setting the priv_size member accordingly, DSA @@ -379,7 +379,7 @@ Switch configuration buses, return a non-NULL string - setup: setup function for the switch, this function is responsible for setting - up the dsa_switch_driver private structure with all it needs: register maps, + up the dsa_switch_ops private structure with all it needs: register maps, interrupts, mutexes, locks etc.. This function is also expected to properly configure the switch to separate all network interfaces from each other, that is, they should be isolated by the switch hardware itself, typically by creating diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 65ecb51f99e5..6fb77cca78bb 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1378,7 +1378,7 @@ static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds) return DSA_TAG_PROTO_NONE; } -static struct dsa_switch_driver b53_switch_ops = { +static struct dsa_switch_ops b53_switch_ops = { .get_tag_protocol = b53_get_tag_protocol, .setup = b53_setup, .set_addr = b53_set_addr, @@ -1618,7 +1618,7 @@ static int b53_switch_init(struct b53_device *dev) dev->vta_regs[1] = chip->vta_regs[1]; dev->vta_regs[2] = chip->vta_regs[2]; dev->jumbo_pm_reg = chip->jumbo_pm_reg; - ds->drv = &b53_switch_ops; + ds->ops = &b53_switch_ops; dev->cpu_port = chip->cpu_port; dev->num_vlans = chip->vlans; dev->num_arl_entries = chip->arl_entries; diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index b47a74b37a42..220e5d1948ca 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1581,7 +1581,7 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) return 0; } -static struct dsa_switch_driver bcm_sf2_switch_driver = { +static struct dsa_switch_ops bcm_sf2_switch_ops = { .setup = bcm_sf2_sw_setup, .get_tag_protocol = bcm_sf2_sw_get_tag_protocol, .set_addr = bcm_sf2_sw_set_addr, @@ -1632,7 +1632,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) priv = (struct bcm_sf2_priv *)(ds + 1); ds->priv = priv; ds->dev = &pdev->dev; - ds->drv = &bcm_sf2_switch_driver; + ds->ops = &bcm_sf2_switch_ops; dev_set_drvdata(&pdev->dev, ds); diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 1fdfbf3a50bc..7ff9d373a9ee 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -252,7 +252,7 @@ mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) return reg_write(ds, addr, regnum, val); } -static struct dsa_switch_driver mv88e6060_switch_driver = { +static struct dsa_switch_ops mv88e6060_switch_ops = { .get_tag_protocol = mv88e6060_get_tag_protocol, .probe = mv88e6060_drv_probe, .setup = mv88e6060_setup, @@ -263,14 +263,14 @@ static struct dsa_switch_driver mv88e6060_switch_driver = { static int __init mv88e6060_init(void) { - register_switch_driver(&mv88e6060_switch_driver); + register_switch_driver(&mv88e6060_switch_ops); return 0; } module_init(mv88e6060_init); static void __exit mv88e6060_cleanup(void) { - unregister_switch_driver(&mv88e6060_switch_driver); + unregister_switch_driver(&mv88e6060_switch_ops); } module_exit(mv88e6060_cleanup); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 82d45165803c..750d01d775e0 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3976,7 +3976,7 @@ free: return NULL; } -static struct dsa_switch_driver mv88e6xxx_switch_driver = { +static struct dsa_switch_ops mv88e6xxx_switch_ops = { .probe = mv88e6xxx_drv_probe, .get_tag_protocol = mv88e6xxx_get_tag_protocol, .setup = mv88e6xxx_setup, @@ -4025,7 +4025,7 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip, ds->dev = dev; ds->priv = chip; - ds->drv = &mv88e6xxx_switch_driver; + ds->ops = &mv88e6xxx_switch_ops; dev_set_drvdata(dev, ds); @@ -4118,7 +4118,7 @@ static struct mdio_driver mv88e6xxx_driver = { static int __init mv88e6xxx_init(void) { - register_switch_driver(&mv88e6xxx_switch_driver); + register_switch_driver(&mv88e6xxx_switch_ops); return mdio_driver_register(&mv88e6xxx_driver); } module_init(mv88e6xxx_init); @@ -4126,7 +4126,7 @@ module_init(mv88e6xxx_init); static void __exit mv88e6xxx_cleanup(void) { mdio_driver_unregister(&mv88e6xxx_driver); - unregister_switch_driver(&mv88e6xxx_switch_driver); + unregister_switch_driver(&mv88e6xxx_switch_ops); } module_exit(mv88e6xxx_cleanup); diff --git a/include/net/dsa.h b/include/net/dsa.h index 8ca2684c5358..2ebeba44a461 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -165,9 +165,9 @@ struct dsa_switch { struct dsa_chip_data *cd; /* - * The used switch driver. + * The switch operations. */ - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; /* * An array of which element [a] indicates which port on this @@ -236,7 +236,7 @@ struct switchdev_obj; struct switchdev_obj_port_fdb; struct switchdev_obj_port_vlan; -struct dsa_switch_driver { +struct dsa_switch_ops { struct list_head list; /* @@ -371,8 +371,8 @@ struct dsa_switch_driver { int (*cb)(struct switchdev_obj *obj)); }; -void register_switch_driver(struct dsa_switch_driver *type); -void unregister_switch_driver(struct dsa_switch_driver *type); +void register_switch_driver(struct dsa_switch_ops *type); +void unregister_switch_driver(struct dsa_switch_ops *type); struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); static inline void *ds_to_priv(struct dsa_switch *ds) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 8d3a28d4e99d..d8d267e9a872 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -61,27 +61,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { static DEFINE_MUTEX(dsa_switch_drivers_mutex); static LIST_HEAD(dsa_switch_drivers); -void register_switch_driver(struct dsa_switch_driver *drv) +void register_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&drv->list, &dsa_switch_drivers); + list_add_tail(&ops->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(register_switch_driver); -void unregister_switch_driver(struct dsa_switch_driver *drv) +void unregister_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&drv->list); + list_del_init(&ops->list); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(unregister_switch_driver); -static struct dsa_switch_driver * +static struct dsa_switch_ops * dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, const char **_name, void **priv) { - struct dsa_switch_driver *ret; + struct dsa_switch_ops *ret; struct list_head *list; const char *name; @@ -90,13 +90,13 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, mutex_lock(&dsa_switch_drivers_mutex); list_for_each(list, &dsa_switch_drivers) { - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; - drv = list_entry(list, struct dsa_switch_driver, list); + ops = list_entry(list, struct dsa_switch_ops, list); - name = drv->probe(parent, host_dev, sw_addr, priv); + name = ops->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { - ret = drv; + ret = ops; break; } } @@ -117,7 +117,7 @@ static ssize_t temp1_input_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp(ds, &temp); + ret = ds->ops->get_temp(ds, &temp); if (ret < 0) return ret; @@ -131,7 +131,7 @@ static ssize_t temp1_max_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp_limit(ds, &temp); + ret = ds->ops->get_temp_limit(ds, &temp); if (ret < 0) return ret; @@ -149,7 +149,7 @@ static ssize_t temp1_max_store(struct device *dev, if (ret < 0) return ret; - ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); + ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); if (ret < 0) return ret; @@ -164,7 +164,7 @@ static ssize_t temp1_max_alarm_show(struct device *dev, bool alarm; int ret; - ret = ds->drv->get_temp_alarm(ds, &alarm); + ret = ds->ops->get_temp_alarm(ds, &alarm); if (ret < 0) return ret; @@ -184,15 +184,15 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct dsa_switch *ds = dev_get_drvdata(dev); - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; umode_t mode = attr->mode; if (index == 1) { - if (!drv->get_temp_limit) + if (!ops->get_temp_limit) mode = 0; - else if (!drv->set_temp_limit) + else if (!ops->set_temp_limit) mode &= ~S_IWUSR; - } else if (index == 2 && !drv->get_temp_alarm) { + } else if (index == 2 && !ops->get_temp_alarm) { mode = 0; } return mode; @@ -228,8 +228,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, genphy_config_init(phydev); genphy_read_status(phydev); - if (ds->drv->adjust_link) - ds->drv->adjust_link(ds, port, phydev); + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); } return 0; @@ -303,7 +303,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; struct dsa_switch_tree *dst = ds->dst; struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; @@ -356,7 +356,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (dst->cpu_switch == index) { enum dsa_tag_protocol tag_protocol; - tag_protocol = drv->get_tag_protocol(ds); + tag_protocol = ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { ret = PTR_ERR(dst->tag_ops); @@ -371,15 +371,15 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) /* * Do basic register setup. */ - ret = drv->setup(ds); + ret = ops->setup(ds); if (ret < 0) goto out; - ret = drv->set_addr(ds, dst->master_netdev->dev_addr); + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) goto out; - if (!ds->slave_mii_bus && drv->phy_read) { + if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); if (!ds->slave_mii_bus) { ret = -ENOMEM; @@ -426,7 +426,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * register with hardware monitoring subsystem. * Treat registration error as non-fatal and ignore it. */ - if (drv->get_temp) { + if (ops->get_temp) { const char *netname = netdev_name(dst->master_netdev); char hname[IFNAMSIZ + 1]; int i, j; @@ -457,7 +457,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { struct dsa_chip_data *cd = dst->pd->chip + index; - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; struct dsa_switch *ds; int ret; const char *name; @@ -466,8 +466,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. */ - drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); - if (drv == NULL) { + ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); + if (!ops) { netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", index); return ERR_PTR(-EINVAL); @@ -486,7 +486,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->dst = dst; ds->index = index; ds->cd = cd; - ds->drv = drv; + ds->ops = ops; ds->priv = priv; ds->dev = parent; @@ -541,7 +541,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) ds->dsa_port_mask |= ~(1 << port); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } @@ -560,8 +560,8 @@ int dsa_switch_suspend(struct dsa_switch *ds) return ret; } - if (ds->drv->suspend) - ret = ds->drv->suspend(ds); + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); return ret; } @@ -571,8 +571,8 @@ int dsa_switch_resume(struct dsa_switch *ds) { int i, ret = 0; - if (ds->drv->resume) - ret = ds->drv->resume(ds); + if (ds->ops->resume) + ret = ds->ops->resume(ds); if (ret) return ret; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 2e343221464c..8278385dcd21 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -294,25 +294,25 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) int err; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus - * driver and before drv->setup() has run, since the switch drivers and + * driver and before ops->setup() has run, since the switch drivers and * the slave MDIO bus driver rely on these values for probing PHY * devices or not */ ds->phys_mii_mask = ds->enabled_port_mask; - err = ds->drv->setup(ds); + err = ds->ops->setup(ds); if (err < 0) return err; - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); if (err < 0) return err; - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); if (err < 0) return err; - if (!ds->slave_mii_bus && ds->drv->phy_read) { + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) return -ENOMEM; @@ -374,7 +374,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) dsa_user_port_unapply(port, index, ds); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } @@ -466,7 +466,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, dst->cpu_port = index; } - tag_protocol = ds->drv->get_tag_protocol(ds); + tag_protocol = ds->ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); @@ -543,7 +543,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) ds->ports[reg].dn = port; - /* Initialize enabled_port_mask now for drv->setup() + /* Initialize enabled_port_mask now for ops->setup() * to have access to a correct value, just like what * net/dsa/dsa.c::dsa_switch_setup_one does. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fc9196745225..9f6c2a20f6ff 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -28,7 +28,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_read(ds, addr, reg); + return ds->ops->phy_read(ds, addr, reg); return 0xffff; } @@ -38,7 +38,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_write(ds, addr, reg, val); + return ds->ops->phy_write(ds, addr, reg, val); return 0; } @@ -98,14 +98,14 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->drv->port_enable) { - err = ds->drv->port_enable(ds, p->port, p->phy); + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, p->port, p->phy); if (err) goto clear_promisc; } - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, stp_state); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -144,11 +144,11 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->drv->port_disable) - ds->drv->port_disable(ds, p->port, p->phy); + if (ds->ops->port_disable) + ds->ops->port_disable(ds, p->port, p->phy); - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -209,13 +209,13 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) return -EOPNOTSUPP; - return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); + return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); } - ds->drv->port_vlan_add(ds, p->port, vlan, trans); + ds->ops->port_vlan_add(ds, p->port, vlan, trans); return 0; } @@ -226,10 +226,10 @@ static int dsa_slave_port_vlan_del(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (!ds->drv->port_vlan_del) + if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - return ds->drv->port_vlan_del(ds, p->port, vlan); + return ds->ops->port_vlan_del(ds, p->port, vlan); } static int dsa_slave_port_vlan_dump(struct net_device *dev, @@ -239,8 +239,8 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_vlan_dump) - return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); + if (ds->ops->port_vlan_dump) + return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); return -EOPNOTSUPP; } @@ -253,13 +253,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); } - ds->drv->port_fdb_add(ds, p->port, fdb, trans); + ds->ops->port_fdb_add(ds, p->port, fdb, trans); return 0; } @@ -271,8 +271,8 @@ static int dsa_slave_port_fdb_del(struct net_device *dev, struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, fdb); + if (ds->ops->port_fdb_del) + ret = ds->ops->port_fdb_del(ds, p->port, fdb); return ret; } @@ -284,8 +284,8 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_fdb_dump) - return ds->drv->port_fdb_dump(ds, p->port, fdb, cb); + if (ds->ops->port_fdb_dump) + return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); return -EOPNOTSUPP; } @@ -308,9 +308,9 @@ static int dsa_slave_stp_state_set(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) - return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP; + return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; - ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state); + ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state); return 0; } @@ -326,8 +326,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, if (switchdev_trans_ph_prepare(trans)) return 0; - if (ds->drv->port_vlan_filtering) - return ds->drv->port_vlan_filtering(ds, p->port, + if (ds->ops->port_vlan_filtering) + return ds->ops->port_vlan_filtering(ds, p->port, attr->u.vlan_filtering); return 0; @@ -365,8 +365,8 @@ static int dsa_slave_ageing_time(struct net_device *dev, ds->ports[p->port].ageing_time = ageing_time; ageing_time = dsa_fastest_ageing_time(ds, ageing_time); - if (ds->drv->set_ageing_time) - return ds->drv->set_ageing_time(ds, ageing_time); + if (ds->ops->set_ageing_time) + return ds->ops->set_ageing_time(ds, ageing_time); return 0; } @@ -481,8 +481,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, p->bridge_dev = br; - if (ds->drv->port_bridge_join) - ret = ds->drv->port_bridge_join(ds, p->port, br); + if (ds->ops->port_bridge_join) + ret = ds->ops->port_bridge_join(ds, p->port, br); return ret == -EOPNOTSUPP ? 0 : ret; } @@ -493,16 +493,16 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev) struct dsa_switch *ds = p->parent; - if (ds->drv->port_bridge_leave) - ds->drv->port_bridge_leave(ds, p->port); + if (ds->ops->port_bridge_leave) + ds->ops->port_bridge_leave(ds, p->port); p->bridge_dev = NULL; /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, @@ -605,8 +605,8 @@ static int dsa_slave_get_regs_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_regs_len) - return ds->drv->get_regs_len(ds, p->port); + if (ds->ops->get_regs_len) + return ds->ops->get_regs_len(ds, p->port); return -EOPNOTSUPP; } @@ -617,8 +617,8 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_regs) - ds->drv->get_regs(ds, p->port, regs, _p); + if (ds->ops->get_regs) + ds->ops->get_regs(ds, p->port, regs, _p); } static int dsa_slave_nway_reset(struct net_device *dev) @@ -651,8 +651,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; - if (ds->drv->get_eeprom_len) - return ds->drv->get_eeprom_len(ds); + if (ds->ops->get_eeprom_len) + return ds->ops->get_eeprom_len(ds); return 0; } @@ -663,8 +663,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_eeprom) - return ds->drv->get_eeprom(ds, eeprom, data); + if (ds->ops->get_eeprom) + return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } @@ -675,8 +675,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->set_eeprom) - return ds->drv->set_eeprom(ds, eeprom, data); + if (ds->ops->set_eeprom) + return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } @@ -694,8 +694,8 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + len, "tx_bytes", len); strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); - if (ds->drv->get_strings != NULL) - ds->drv->get_strings(ds, p->port, data + 4 * len); + if (ds->ops->get_strings) + ds->ops->get_strings(ds, p->port, data + 4 * len); } } @@ -714,8 +714,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data); } - if (ds->drv->get_ethtool_stats) - ds->drv->get_ethtool_stats(ds, cpu_port, data + count); + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, cpu_port, data + count); } static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) @@ -727,8 +727,8 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) if (dst->master_ethtool_ops.get_sset_count) count += dst->master_ethtool_ops.get_sset_count(dev, sset); - if (sset == ETH_SS_STATS && ds->drv->get_sset_count) - count += ds->drv->get_sset_count(ds); + if (sset == ETH_SS_STATS && ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); return count; } @@ -755,14 +755,14 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, dst->master_ethtool_ops.get_strings(dev, stringset, data); } - if (stringset == ETH_SS_STATS && ds->drv->get_strings) { + if (stringset == ETH_SS_STATS && ds->ops->get_strings) { ndata = data + mcount * len; /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->drv->get_strings(ds, cpu_port, ndata); - count = ds->drv->get_sset_count(ds); + ds->ops->get_strings(ds, cpu_port, ndata); + count = ds->ops->get_sset_count(ds); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), ndata + i * len, len - sizeof(pfx)); @@ -782,8 +782,8 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, data[1] = dev->stats.tx_bytes; data[2] = dev->stats.rx_packets; data[3] = dev->stats.rx_bytes; - if (ds->drv->get_ethtool_stats != NULL) - ds->drv->get_ethtool_stats(ds, p->port, data + 4); + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, p->port, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) @@ -795,8 +795,8 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) int count; count = 4; - if (ds->drv->get_sset_count != NULL) - count += ds->drv->get_sset_count(ds); + if (ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); return count; } @@ -809,8 +809,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->get_wol) - ds->drv->get_wol(ds, p->port, w); + if (ds->ops->get_wol) + ds->ops->get_wol(ds, p->port, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) @@ -819,8 +819,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->set_wol) - ret = ds->drv->set_wol(ds, p->port, w); + if (ds->ops->set_wol) + ret = ds->ops->set_wol(ds, p->port, w); return ret; } @@ -831,10 +831,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) struct dsa_switch *ds = p->parent; int ret; - if (!ds->drv->set_eee) + if (!ds->ops->set_eee) return -EOPNOTSUPP; - ret = ds->drv->set_eee(ds, p->port, p->phy, e); + ret = ds->ops->set_eee(ds, p->port, p->phy, e); if (ret) return ret; @@ -850,10 +850,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) struct dsa_switch *ds = p->parent; int ret; - if (!ds->drv->get_eee) + if (!ds->ops->get_eee) return -EOPNOTSUPP; - ret = ds->drv->get_eee(ds, p->port, e); + ret = ds->ops->get_eee(ds, p->port, e); if (ret) return ret; @@ -988,8 +988,8 @@ static void dsa_slave_adjust_link(struct net_device *dev) p->old_pause = p->phy->pause; } - if (ds->drv->adjust_link && status_changed) - ds->drv->adjust_link(ds, p->port, p->phy); + if (ds->ops->adjust_link && status_changed) + ds->ops->adjust_link(ds, p->port, p->phy); if (status_changed) phy_print_status(p->phy); @@ -1004,8 +1004,8 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, if (dev) { p = netdev_priv(dev); ds = p->parent; - if (ds->drv->fixed_link_update) - ds->drv->fixed_link_update(ds, p->port, status); + if (ds->ops->fixed_link_update) + ds->ops->fixed_link_update(ds, p->port, status); } return 0; @@ -1062,8 +1062,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_dn = port_dn; } - if (ds->drv->get_phy_flags) - phy_flags = ds->drv->get_phy_flags(ds, p->port); + if (ds->ops->get_phy_flags) + phy_flags = ds->ops->get_phy_flags(ds, p->port); if (phy_dn) { int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); -- cgit v1.2.3 From a52e95abf772b43c9226e9a72d3c1353903ba96f Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:26 +0900 Subject: net: diag: allow socket bytecode filters to match socket marks This allows a privileged process to filter by socket mark when dumping sockets via INET_DIAG_BY_FAMILY. This is useful on systems that use mark-based routing such as Android. The ability to filter socket marks requires CAP_NET_ADMIN, which is consistent with other privileged operations allowed by the SOCK_DIAG interface such as the ability to destroy sockets and the ability to inspect BPF filters attached to packet sockets. Tested: https://android-review.googlesource.com/261350 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 6 ++++++ net/ipv4/inet_diag.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index abbd1dc5d683..5581206a08ae 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -73,6 +73,7 @@ enum { INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, INET_DIAG_BC_DEV_COND, /* u32 ifindex */ + INET_DIAG_BC_MARK_COND, }; struct inet_diag_hostcond { @@ -82,6 +83,11 @@ struct inet_diag_hostcond { __be32 addr[0]; }; +struct inet_diag_markcond { + __u32 mark; + __u32 mask; +}; + /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. */ struct inet_diag_msg { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a89b68c6a934..abfbe492ebfe 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -45,6 +45,7 @@ struct inet_diag_entry { u16 family; u16 userlocks; u32 ifindex; + u32 mark; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -580,6 +581,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_MARK_COND: { + struct inet_diag_markcond *cond; + + cond = (struct inet_diag_markcond *)(op + 1); + if ((entry->mark & cond->mask) != cond->mark) + yes = 0; + break; + } } if (yes) { @@ -624,6 +633,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; + if (sk_fullsock(sk)) + entry.mark = sk->sk_mark; + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else + entry.mark = 0; return inet_diag_bc_run(bc, &entry); } @@ -706,8 +721,17 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const struct nlattr *attr) +static bool valid_markcond(const struct inet_diag_bc_op *op, int len, + int *min_len) { + *min_len += sizeof(struct inet_diag_markcond); + return len >= *min_len; +} + +static int inet_diag_bc_audit(const struct nlattr *attr, + const struct sk_buff *skb) +{ + bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; @@ -738,6 +762,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr) if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_MARK_COND: + if (!net_admin) + return -EPERM; + if (!valid_markcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: @@ -1030,7 +1060,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } @@ -1061,7 +1091,7 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } -- cgit v1.2.3 From 72145a68e4ee116533df49af4b87aca0aacc179c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Aug 2016 09:01:23 -0700 Subject: tcp: md5: add LINUX_MIB_TCPMD5FAILURE counter Adds SNMP counter for drops caused by MD5 mismatches. The current syslog might help, but a counter is more precise and helps monitoring. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 25a9ad8bcef1..e7a31f830690 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -235,6 +235,7 @@ enum LINUX_MIB_TCPSPURIOUSRTOS, /* TCPSpuriousRTOs */ LINUX_MIB_TCPMD5NOTFOUND, /* TCPMD5NotFound */ LINUX_MIB_TCPMD5UNEXPECTED, /* TCPMD5Unexpected */ + LINUX_MIB_TCPMD5FAILURE, /* TCPMD5Failure */ LINUX_MIB_SACKSHIFTED, LINUX_MIB_SACKMERGED, LINUX_MIB_SACKSHIFTFALLBACK, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f665b63a927..1ed015e4bc79 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), + SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 436d978c6c39..ad41e8ecf796 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1169,6 +1169,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ac0ed7bda406..e4f55683af31 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -671,6 +671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", genhash ? "failed" : "mismatch", &ip6h->saddr, ntohs(th->source), -- cgit v1.2.3 From eb60a8ddf3c38959cc73821bec5335bed85e0200 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Aug 2016 10:23:34 -0700 Subject: net: minor optimization in qdisc_qstats_cpu_drop() per_cpu_inc() is faster (at least on x86) than per_cpu_ptr(xxx)++; Signed-off-by: Eric Dumazet Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0d501779cc68..52a2015667b4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -592,7 +592,7 @@ static inline void qdisc_qstats_drop(struct Qdisc *sch) static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) { - qstats_drop_inc(this_cpu_ptr(sch->cpu_qstats)); + this_cpu_inc(sch->cpu_qstats->drops); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) -- cgit v1.2.3 From 5ca8cc5bf11faed257c762018aea9106d529232f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Aug 2016 12:31:31 +0200 Subject: rhashtable: add rhashtable_lookup_get_insert_key() This patch modifies __rhashtable_insert_fast() so it returns the existing object that clashes with the one that you want to insert. In case the object is successfully inserted, NULL is returned. Otherwise, you get an error via ERR_PTR(). This patch adapts the existing callers of __rhashtable_insert_fast() so they handle this new logic, and it adds a new rhashtable_lookup_get_insert_key() interface to fetch this existing object. nf_tables needs this change to improve handling of EEXIST cases via honoring the NLM_F_EXCL flag and by checking if the data part of the mapping matches what we have. Cc: Herbert Xu Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso Acked-by: Herbert Xu --- include/linux/rhashtable.h | 70 +++++++++++++++++++++++++++++++++++++--------- lib/rhashtable.c | 10 +++++-- 2 files changed, 64 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3eef0802a0cd..26b7a059c65e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *ht, struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj, - struct bucket_table *old_tbl); + struct bucket_table *old_tbl, + void **data); int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, @@ -563,8 +564,11 @@ restart: return NULL; } -/* Internal function, please use rhashtable_insert_fast() instead */ -static inline int __rhashtable_insert_fast( +/* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes in there is a clash, + * otherwise it returns an error via ERR_PTR(). + */ +static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -577,6 +581,7 @@ static inline int __rhashtable_insert_fast( spinlock_t *lock; unsigned int elasticity; unsigned int hash; + void *data = NULL; int err; restart: @@ -601,11 +606,14 @@ restart: new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); + tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data); if (!IS_ERR_OR_NULL(tbl)) goto slow_path; err = PTR_ERR(tbl); + if (err == -EEXIST) + err = 0; + goto out; } @@ -619,25 +627,25 @@ slow_path: err = rhashtable_insert_rehash(ht, tbl); rcu_read_unlock(); if (err) - return err; + return ERR_PTR(err); goto restart; } - err = -EEXIST; + err = 0; elasticity = ht->elasticity; rht_for_each(head, tbl, hash) { if (key && unlikely(!(params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) + rhashtable_compare(&arg, rht_obj(ht, head))))) { + data = rht_obj(ht, head); goto out; + } if (!--elasticity) goto slow_path; } - err = 0; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); @@ -652,7 +660,7 @@ out: spin_unlock_bh(lock); rcu_read_unlock(); - return err; + return err ? ERR_PTR(err) : data; } /** @@ -675,7 +683,13 @@ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { - return __rhashtable_insert_fast(ht, NULL, obj, params); + void *ret; + + ret = __rhashtable_insert_fast(ht, NULL, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; } /** @@ -704,11 +718,15 @@ static inline int rhashtable_lookup_insert_fast( const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); + void *ret; BUG_ON(ht->p.obj_hashfn); - return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, - params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; } /** @@ -736,6 +754,32 @@ static inline int rhashtable_lookup_insert_fast( static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) +{ + void *ret; + + BUG_ON(!ht->p.obj_hashfn || !key); + + ret = __rhashtable_insert_fast(ht, key, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_key - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * @data: pointer to element data already in hashes + * + * Just like rhashtable_lookup_insert_key(), but this function returns the + * object if it exists, NULL if it does not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5d845ffd7982..7a940d92f17e 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -438,7 +438,8 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj, - struct bucket_table *tbl) + struct bucket_table *tbl, + void **data) { struct rhash_head *head; unsigned int hash; @@ -449,8 +450,11 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + if (key) { + *data = rhashtable_lookup_fast(ht, key, ht->p); + if (*data) + goto exit; + } err = -E2BIG; if (unlikely(rht_grow_above_max(ht, tbl))) -- cgit v1.2.3 From c016c7e45ddfa5085b35b644e659ec014969740d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Aug 2016 12:41:54 +0200 Subject: netfilter: nf_tables: honor NLM_F_EXCL flag in set element insertion If the NLM_F_EXCL flag is set, then new elements that clash with an existing one return EEXIST. In case you try to add an element whose data area differs from what we have, then this returns EBUSY. If no flag is specified at all, then this returns success to userspace. This patch also update the set insert operation so we can fetch the existing element that clashes with the one you want to add, we need this to make sure the element data doesn't differ. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 20 +++++++++++++++----- net/netfilter/nft_set_hash.c | 17 +++++++++++++---- net/netfilter/nft_set_rbtree.c | 12 ++++++++---- 4 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f2f13399ce44..8972468bc94b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -251,7 +251,8 @@ struct nft_set_ops { int (*insert)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); + const struct nft_set_elem *elem, + struct nft_set_ext **ext); void (*activate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 221d27f09623..bd9715e5ff26 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3483,12 +3483,12 @@ static int nft_setelem_parse_flags(const struct nft_set *set, } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; - struct nft_set_ext *ext; + struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_userdata *udata; @@ -3615,9 +3615,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem); - if (err < 0) + err = set->ops->insert(ctx->net, set, &elem, &ext2); + if (err) { + if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && + memcmp(nft_set_ext_data(ext), + nft_set_ext_data(ext2), set->dlen) != 0) + err = -EBUSY; + else if (!(nlmsg_flags & NLM_F_EXCL)) + err = 0; + } goto err5; + } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -3673,7 +3683,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) return -ENFILE; - err = nft_add_set_elem(&ctx, set, attr); + err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); if (err < 0) { atomic_dec(&set->nelems); break; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 564fa7929ed5..3794cb2fc788 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -126,7 +126,8 @@ err1: } static int nft_hash_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he = elem->priv; @@ -135,9 +136,17 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, .set = set, .key = elem->key.val.data, }; - - return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params); + struct nft_hash_elem *prev; + + prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); + if (IS_ERR(prev)) + return PTR_ERR(prev); + if (prev) { + *ext = &prev->ext; + return -EEXIST; + } + return 0; } static void nft_hash_activate(const struct net *net, const struct nft_set *set, diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6473936d05c6..038682d48261 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -94,7 +94,8 @@ out: } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, - struct nft_rbtree_elem *new) + struct nft_rbtree_elem *new, + struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); @@ -122,8 +123,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, else if (!nft_rbtree_interval_end(rbe) && nft_rbtree_interval_end(new)) p = &parent->rb_right; - else + else { + *ext = &rbe->ext; return -EEXIST; + } } } } @@ -133,13 +136,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(net, set, rbe); + err = __nft_rbtree_insert(net, set, rbe, ext); spin_unlock_bh(&nft_rbtree_lock); return err; -- cgit v1.2.3 From 2a313cdf1e6e4cc8cc3f16f976e1abfbdd0626fa Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 25 Aug 2016 16:46:44 +0200 Subject: devlink: remove unused priv_size Remove unused and useless priv_size member from struct devlink_ops. Cc: Jiri Pirko Signed-off-by: Ivan Vecera Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index c99ffe8cef3c..211bd3c37028 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -50,7 +50,6 @@ struct devlink_sb_pool_info { }; struct devlink_ops { - size_t priv_size; int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, -- cgit v1.2.3 From 6bc506b4fb065eac3d89ca1ce37082e174493d9e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 25 Aug 2016 18:42:37 +0200 Subject: bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 13 ++--- drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/ethernet/rocker/rocker_ofdpa.c | 4 -- include/linux/netdevice.h | 5 -- include/linux/skbuff.h | 13 ++--- include/net/switchdev.h | 6 --- net/bridge/Makefile | 2 + net/bridge/br_forward.c | 3 +- net/bridge/br_if.c | 10 ++-- net/bridge/br_input.c | 2 + net/bridge/br_private.h | 37 +++++++++++++ net/bridge/br_switchdev.c | 57 ++++++++++++++++++++ net/core/dev.c | 10 ---- net/switchdev/switchdev.c | 85 ------------------------------ 14 files changed, 117 insertions(+), 132 deletions(-) create mode 100644 net/bridge/br_switchdev.c (limited to 'include') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 31c39115834d..44235e83799b 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -283,15 +283,10 @@ be sent to the port netdev for processing by the bridge driver. The bridge should not reflood the packet to the same ports the device flooded, otherwise there will be duplicate packets on the wire. -To avoid duplicate packets, the device/driver should mark a packet as already -forwarded using skb->offload_fwd_mark. The same mark is set on the device -ports in the domain using dev->offload_fwd_mark. If the skb->offload_fwd_mark -is non-zero and matches the forwarding egress port's dev->skb_mark, the kernel -will drop the skb right before transmit on the egress port, with the -understanding that the device already forwarded the packet on same egress port. -The driver can use switchdev_port_fwd_mark_set() to set a globally unique mark -for port's dev->offload_fwd_mark, based on the port's parent ID (switch ID) and -a group ifindex. +To avoid duplicate packets, the switch driver should mark a packet as already +forwarded by setting the skb->offload_fwd_mark bit. The bridge driver will mark +the skb using the ingress bridge port's mark and prevent it from being forwarded +through any bridge port with the same mark. It is possible for the switch device to not handle flooding and push the packets up to the bridge driver for flooding. This is not ideal as the number diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index f0b09b05ed3f..1f0c08602eba 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2412,7 +2412,7 @@ static int rocker_port_rx_proc(const struct rocker *rocker, skb->protocol = eth_type_trans(skb, rocker_port->dev); if (rx_flags & ROCKER_RX_FLAGS_FWD_OFFLOAD) - skb->offload_fwd_mark = rocker_port->dev->offload_fwd_mark; + skb->offload_fwd_mark = 1; rocker_port->dev->stats.rx_packets++; rocker_port->dev->stats.rx_bytes += skb->len; diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 1ca796316173..fcad907baecf 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2558,7 +2558,6 @@ static int ofdpa_port_init(struct rocker_port *rocker_port) struct ofdpa_port *ofdpa_port = rocker_port->wpriv; int err; - switchdev_port_fwd_mark_set(ofdpa_port->dev, NULL, false); rocker_port_set_learning(rocker_port, !!(ofdpa_port->brport_flags & BR_LEARNING)); @@ -2817,7 +2816,6 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port, ofdpa_port_internal_vlan_id_get(ofdpa_port, bridge->ifindex); ofdpa_port->bridge_dev = bridge; - switchdev_port_fwd_mark_set(ofdpa_port->dev, bridge, true); return ofdpa_port_vlan_add(ofdpa_port, NULL, OFDPA_UNTAGGED_VID, 0); } @@ -2836,8 +2834,6 @@ static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port) ofdpa_port_internal_vlan_id_get(ofdpa_port, ofdpa_port->dev->ifindex); - switchdev_port_fwd_mark_set(ofdpa_port->dev, ofdpa_port->bridge_dev, - false); ofdpa_port->bridge_dev = NULL; err = ofdpa_port_vlan_add(ofdpa_port, NULL, OFDPA_UNTAGGED_VID, 0); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 794bb0733799..d122be9345c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1562,8 +1562,6 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * - * @offload_fwd_mark: Offload device fwding mark - * * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1814,9 +1812,6 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto __rcu *egress_cl_list; #endif -#ifdef CONFIG_NET_SWITCHDEV - u32 offload_fwd_mark; -#endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7047448e8129..cfb7219be665 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -612,7 +612,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking - * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -730,7 +729,10 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; - /* 3 or 5 bit hole */ +#ifdef CONFIG_NET_SWITCHDEV + __u8 offload_fwd_mark:1; +#endif + /* 2, 4 or 5 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -757,14 +759,9 @@ struct sk_buff { unsigned int sender_cpu; }; #endif - union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; #endif -#ifdef CONFIG_NET_SWITCHDEV - __u32 offload_fwd_mark; -#endif - }; union { __u32 mark; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 62f6a967a1b7..82f5e0462021 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -347,12 +347,6 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, return idx; } -static inline void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ -} - static inline bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { diff --git a/net/bridge/Makefile b/net/bridge/Makefile index a1cda5d4718d..0aefc011b668 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o +bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o + obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 63a83d8d7da3..32a02de39cd2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + nbp_switchdev_allowed_egress(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f2fede05d32c..1da3221845f1 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err5; + err = nbp_switchdev_mark_set(p); + if (err) + goto err6; + dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) err = nbp_vlan_init(p); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err6; + goto err7; } spin_lock_bh(&br->lock); @@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; -err6: +err7: list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); +err6: netdev_upper_dev_unlink(dev, br->dev); - err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8e486203d133..3132cfc80e9d 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -145,6 +145,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; + nbp_switchdev_frame_mark(p, skb); + /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aac2a6e6b008..2379b2b865c9 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,6 +251,9 @@ struct net_bridge_port #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -359,6 +362,11 @@ struct net_bridge struct timer_list gc_timer; struct kobject *ifobj; u32 auto_cnt; + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif + #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; @@ -381,6 +389,10 @@ struct br_input_skb_cb { #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; #endif + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -1034,4 +1046,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ +/* br_switchdev.c */ +#ifdef CONFIG_NET_SWITCHDEV +int nbp_switchdev_mark_set(struct net_bridge_port *p); +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb); +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb); +#else +static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + return 0; +} + +static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return true; +} +#endif /* CONFIG_NET_SWITCHDEV */ + #endif diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c new file mode 100644 index 000000000000..f4097b900de1 --- /dev/null +++ b/net/bridge/br_switchdev.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + + /* dev is yet to be added to the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (switchdev_port_same_parent_id(dev, p->dev)) + return p->offload_fwd_mark; + } + + return ++br->offload_fwd_mark; +} + +int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; + int err; + + ASSERT_RTNL(); + + err = switchdev_port_attr_get(p->dev, &attr); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); + + return 0; +} + +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) + BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; +} + +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return !skb->offload_fwd_mark || + BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; +} diff --git a/net/core/dev.c b/net/core/dev.c index 7feae74ca928..1d5c6dda1988 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3355,16 +3355,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); -#ifdef CONFIG_NET_SWITCHDEV - /* Don't forward if offload device already forwarded */ - if (skb->offload_fwd_mark && - skb->offload_fwd_mark == dev->offload_fwd_mark) { - consume_skb(skb); - rc = NET_XMIT_SUCCESS; - goto out; - } -#endif - txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 2c683f24d557..1031a0327fff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1305,88 +1305,3 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); - -static u32 switchdev_port_fwd_mark_get(struct net_device *dev, - struct net_device *group_dev) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev == dev) - continue; - if (switchdev_port_same_parent_id(dev, lower_dev)) - return lower_dev->offload_fwd_mark; - return switchdev_port_fwd_mark_get(dev, lower_dev); - } - - return dev->ifindex; -} - -static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, - u32 old_mark, u32 *reset_mark) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev->offload_fwd_mark == old_mark) { - if (!*reset_mark) - *reset_mark = lower_dev->ifindex; - lower_dev->offload_fwd_mark = *reset_mark; - } - switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); - } -} - -/** - * switchdev_port_fwd_mark_set - Set port offload forwarding mark - * - * @dev: port device - * @group_dev: containing device - * @joining: true if dev is joining group; false if leaving group - * - * An ungrouped port's offload mark is just its ifindex. A grouped - * port's (member of a bridge, for example) offload mark is the ifindex - * of one of the ports in the group with the same parent (switch) ID. - * Ports on the same device in the same group will have the same mark. - * - * Example: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=5 - * sw2p2 ifindex=5 mark=5 - * - * If sw2p2 leaves the bridge, we'll have: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=4 - * sw2p2 ifindex=5 mark=5 - */ -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ - u32 mark = dev->ifindex; - u32 reset_mark = 0; - - if (group_dev) { - ASSERT_RTNL(); - if (joining) - mark = switchdev_port_fwd_mark_get(dev, group_dev); - else if (dev->offload_fwd_mark == mark) - /* Ohoh, this port was the mark reference port, - * but it's leaving the group, so reset the - * mark for the remaining ports in the group. - */ - switchdev_port_fwd_mark_reset(group_dev, mark, - &reset_mark); - } - - dev->offload_fwd_mark = mark; -} -EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); -- cgit v1.2.3 From ef20cd4dd1633987bcf46ac34ace2c8af212361f Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Fri, 26 Aug 2016 10:52:53 +0200 Subject: tipc: introduce UDP replicast This patch introduces UDP replicast. A concept where we emulate multicast by sending multiple unicast messages to configured peers. The purpose of replicast is mainly to be able to use TIPC in cloud environments where IP multicast is disabled. Using replicas to unicast multicast messages is costly as we have to copy each skb and send the copies individually. Signed-off-by: Richard Alpe Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 44 ++++++++++++++ net/tipc/bearer.h | 1 + net/tipc/netlink.c | 5 ++ net/tipc/udp_media.c | 118 ++++++++++++++++++++++++++++++++++---- net/tipc/udp_media.h | 44 ++++++++++++++ 6 files changed, 201 insertions(+), 12 deletions(-) create mode 100644 net/tipc/udp_media.h (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index bcb65ef725f6..b15664c36219 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -60,6 +60,7 @@ enum { TIPC_NL_MON_GET, TIPC_NL_MON_PEER_GET, TIPC_NL_PEER_REMOVE, + TIPC_NL_BEARER_ADD, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 28056fa8f77a..d7b442dd6669 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -42,6 +42,7 @@ #include "monitor.h" #include "bcast.h" #include "netlink.h" +#include "udp_media.h" #define MAX_ADDR_STR 60 @@ -897,6 +898,49 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) return 0; } +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_bearer *b; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + struct net *net = sock_net(skb->sk); + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(net, name); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { + err = tipc_udp_nl_bearer_add(b, + attrs[TIPC_NLA_BEARER_UDP_OPTS]); + if (err) { + rtnl_unlock(); + return err; + } + } +#endif + rtnl_unlock(); + + return 0; +} + int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { int err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 83a9abbfe32c..78892e2f53e3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -181,6 +181,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 2718de667828..3122f21ca979 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -160,6 +160,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .dumpit = tipc_nl_bearer_dump, .policy = tipc_nl_policy, }, + { + .cmd = TIPC_NL_BEARER_ADD, + .doit = tipc_nl_bearer_add, + .policy = tipc_nl_policy, + }, { .cmd = TIPC_NL_BEARER_SET, .doit = tipc_nl_bearer_set, diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b8ec1a18241e..6b938cc15daf 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -49,6 +49,7 @@ #include "core.h" #include "bearer.h" #include "netlink.h" +#include "msg.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 @@ -70,6 +71,13 @@ struct udp_media_addr { }; }; +/* struct udp_replicast - container for UDP remote addresses */ +struct udp_replicast { + struct udp_media_addr addr; + struct rcu_head rcu; + struct list_head list; +}; + /** * struct udp_bearer - ip/udp bearer data structure * @bearer: associated generic tipc bearer @@ -82,6 +90,7 @@ struct udp_bearer { struct socket *ubsock; u32 ifindex; struct work_struct work; + struct udp_replicast rcast; }; static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr) @@ -203,29 +212,75 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, { struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value; + struct udp_replicast *rcast; struct udp_bearer *ub; int err = 0; if (skb_headroom(skb) < UDP_MIN_HEADROOM) { err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); if (err) - goto tx_error; + goto out; } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) { err = -ENODEV; - goto tx_error; + goto out; } - return tipc_udp_xmit(net, skb, ub, src, dst); + if (!addr->broadcast || list_empty(&ub->rcast.list)) + return tipc_udp_xmit(net, skb, ub, src, dst); -tx_error: + /* Replicast, send an skb to each configured IP address */ + list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { + struct sk_buff *_skb; + + _skb = pskb_copy(skb, GFP_ATOMIC); + if (!_skb) { + err = -ENOMEM; + goto out; + } + + err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); + if (err) { + kfree_skb(_skb); + goto out; + } + } + err = 0; +out: kfree_skb(skb); return err; } +static int tipc_udp_rcast_add(struct tipc_bearer *b, + struct udp_media_addr *addr) +{ + struct udp_replicast *rcast; + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); + if (!rcast) + return -ENOMEM; + + memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); + + if (ntohs(addr->proto) == ETH_P_IP) + pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4); +#if IS_ENABLED(CONFIG_IPV6) + else if (ntohs(addr->proto) == ETH_P_IPV6) + pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); +#endif + + list_add_rcu(&rcast->list, &ub->rcast.list); + return 0; +} + /* tipc_udp_recv - read data from bearer socket */ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { @@ -320,6 +375,32 @@ static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr, return -EADDRNOTAVAIL; } +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) +{ + int err; + struct udp_media_addr addr = {0}; + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct udp_media_addr *dst; + + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy)) + return -EINVAL; + + if (!opts[TIPC_NLA_UDP_REMOTE]) + return -EINVAL; + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL); + if (err) + return err; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (tipc_udp_is_mcast_addr(dst)) { + pr_err("Can't add remote ip to TIPC UDP multicast bearer\n"); + return -EINVAL; + } + + return tipc_udp_rcast_add(b, &addr); +} + /** * tipc_udp_enable - callback to create a new udp bearer instance * @net: network namespace @@ -334,7 +415,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, { int err = -EINVAL; struct udp_bearer *ub; - struct udp_media_addr *remote; + struct udp_media_addr remote = {0}; struct udp_media_addr local = {0}; struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; @@ -344,6 +425,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (!ub) return -ENOMEM; + INIT_LIST_HEAD(&ub->rcast.list); + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; @@ -362,9 +445,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto err; - remote = (struct udp_media_addr *)&b->bcast_addr.value; - memset(remote, 0, sizeof(struct udp_media_addr)); - err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], remote, NULL); + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL); if (err) goto err; @@ -409,10 +490,17 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); - if (tipc_udp_is_mcast_addr(remote)) { - if (enable_mcast(ub, remote)) - goto err; - } + /** + * The bcast media address port is used for all peers and the ip + * is used if it's a multicast address. + */ + memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); + if (tipc_udp_is_mcast_addr(&remote)) + err = enable_mcast(ub, &remote); + else + err = tipc_udp_rcast_add(b, &remote); + if (err) + goto err; return 0; err: @@ -424,6 +512,12 @@ err: static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + struct udp_replicast *rcast, *tmp; + + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + list_del_rcu(&rcast->list); + kfree_rcu(rcast, rcu); + } if (ub->ubsock) udp_tunnel_sock_release(ub->ubsock); diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h new file mode 100644 index 000000000000..4dcb54880aa6 --- /dev/null +++ b/net/tipc/udp_media.h @@ -0,0 +1,44 @@ +/* + * net/tipc/udp_media.h: Include file for UDP bearer media + * + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef CONFIG_TIPC_MEDIA_UDP +#ifndef _TIPC_UDP_MEDIA_H +#define _TIPC_UDP_MEDIA_H + +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); + +#endif +#endif -- cgit v1.2.3 From fdb3accc2c15fabc2b687b2819da9167027c61b6 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Fri, 26 Aug 2016 10:52:55 +0200 Subject: tipc: add the ability to get UDP options via netlink Add UDP bearer options to netlink bearer get message. This is used by the tipc user space tool to display UDP options. The UDP bearer information is passed using either a sockaddr_in or sockaddr_in6 structs. This means the user space receiver should intermediately store the retrieved data in a large enough struct (sockaddr_strage) before casting to the proper IP version type. Signed-off-by: Richard Alpe Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 ++ net/tipc/bearer.c | 8 +++++ net/tipc/udp_media.c | 61 +++++++++++++++++++++++++++++++++++++++ net/tipc/udp_media.h | 1 + 4 files changed, 72 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index b15664c36219..f9edd20fe9ba 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -61,6 +61,7 @@ enum { TIPC_NL_MON_PEER_GET, TIPC_NL_PEER_REMOVE, TIPC_NL_BEARER_ADD, + TIPC_NL_UDP_GET_REMOTEIP, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -100,6 +101,7 @@ enum { TIPC_NLA_UDP_UNSPEC, TIPC_NLA_UDP_LOCAL, /* sockaddr_storage */ TIPC_NLA_UDP_REMOTE, /* sockaddr_storage */ + TIPC_NLA_UDP_MULTI_REMOTEIP, /* flag */ __TIPC_NLA_UDP_MAX, TIPC_NLA_UDP_MAX = __TIPC_NLA_UDP_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d7b442dd6669..975dbeb60ab0 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -712,6 +712,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; nla_nest_end(msg->skb, prop); + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) { + if (tipc_udp_nl_add_bearer_data(msg, bearer)) + goto attr_msg_full; + } +#endif + nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 6ece3c9ccf82..a6cdd9895653 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -401,6 +401,67 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) return err; } +static int __tipc_nl_add_udp_addr(struct sk_buff *skb, + struct udp_media_addr *addr, int nla_t) +{ + if (ntohs(addr->proto) == ETH_P_IP) { + struct sockaddr_in ip4; + + ip4.sin_family = AF_INET; + ip4.sin_port = addr->port; + ip4.sin_addr.s_addr = addr->ipv4.s_addr; + if (nla_put(skb, nla_t, sizeof(ip4), &ip4)) + return -EMSGSIZE; + +#if IS_ENABLED(CONFIG_IPV6) + } else if (ntohs(addr->proto) == ETH_P_IPV6) { + struct sockaddr_in6 ip6; + + ip6.sin6_family = AF_INET6; + ip6.sin6_port = addr->port; + memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); + if (nla_put(skb, nla_t, sizeof(ip6), &ip6)) + return -EMSGSIZE; +#endif + } + + return 0; +} + +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) +{ + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct udp_media_addr *dst; + struct udp_bearer *ub; + struct nlattr *nest; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); + if (!nest) + goto msg_full; + + if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL)) + goto msg_full; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE)) + goto msg_full; + + if (!list_empty(&ub->rcast.list)) { + if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP)) + goto msg_full; + } + + nla_nest_end(msg->skb, nest); + return 0; +msg_full: + nla_nest_cancel(msg->skb, nest); + return -EMSGSIZE; +} + /** * tipc_parse_udp_addr - build udp media address from netlink data * @nlattr: netlink attribute containing sockaddr storage aligned address diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h index 4dcb54880aa6..c06326a134db 100644 --- a/net/tipc/udp_media.h +++ b/net/tipc/udp_media.h @@ -39,6 +39,7 @@ #define _TIPC_UDP_MEDIA_H int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); #endif #endif -- cgit v1.2.3 From 0294b625ad5a6d1fb50632d67cf384862d8a4a46 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 28 Aug 2016 14:43:17 -0700 Subject: net: Add read_sock proto_op Add new function in proto_ops structure. This includes moving the typedef got sk_read_actor into net.h and removing the definition from tcp.h. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/net.h | 6 ++++++ include/net/tcp.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index b9f0ff4d489c..cd0c8bd0a1de 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -128,6 +129,9 @@ struct page; struct sockaddr; struct msghdr; struct module; +struct sk_buff; +typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, + unsigned int, size_t); struct proto_ops { int family; @@ -186,6 +190,8 @@ struct proto_ops { struct pipe_inode_info *pipe, size_t len, unsigned int flags); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); + int (*read_sock)(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 25d64f6de69e..d56666ad9249 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -603,8 +603,6 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ -typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, - unsigned int, size_t); int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -- cgit v1.2.3 From 3203558589a597e0a10a66b258fbc5a4a6659ed0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 28 Aug 2016 14:43:18 -0700 Subject: tcp: Set read_sock and peek_len proto_ops In inet_stream_ops we set read_sock to tcp_read_sock and peek_len to tcp_peek_len (which is just a stub function that calls tcp_inq). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp.c | 6 ++++++ net/ipv6/af_inet6.c | 2 ++ 4 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d56666ad9249..a5af6be3a572 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1848,6 +1848,8 @@ static inline int tcp_inq(struct sock *sk) return answ; } +int tcp_peek_len(struct socket *sock); + static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 989a362814a9..e94b47be0019 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -916,6 +916,8 @@ const struct proto_ops inet_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1a9a0a8a1f3..60a438864f32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1570,6 +1570,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_peek_len(struct socket *sock) +{ + return tcp_inq(sock->sk); +} +EXPORT_SYMBOL(tcp_peek_len); + /* * This routine copies from a sock struct into the user buffer. * diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b454055ba625..46ad699937fd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -545,6 +545,8 @@ const struct proto_ops inet6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, -- cgit v1.2.3 From 96a59083478d1ea66684c59c073424a9d4e6ac6d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 28 Aug 2016 14:43:19 -0700 Subject: kcm: Remove TCP specific references from kcm and strparser kcm and strparser need to work with any type of stream socket not just TCP. Eliminate references to TCP and call generic proto_ops functions of read_sock and peek_len. Also in strp_init check if the socket support the proto_ops read_sock and peek_len. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/strparser.h | 2 +- net/kcm/kcmsock.c | 30 +++++++++++++---------------- net/strparser/strparser.c | 48 ++++++++++++++++++++++++++++------------------- 3 files changed, 43 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 91fa0b958426..0c28ad97c52f 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -137,6 +137,6 @@ void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *csk, struct strp_callbacks *cb); -void strp_tcp_data_ready(struct strparser *strp); +void strp_data_ready(struct strparser *strp); #endif /* __NET_STRPARSER_H_ */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index eb731cacc325..2632ac748371 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -26,7 +26,6 @@ #include #include #include -#include #include unsigned int kcm_net_id; @@ -340,7 +339,7 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, } /* Lower sock lock held */ -static void psock_tcp_data_ready(struct sock *sk) +static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; @@ -348,7 +347,7 @@ static void psock_tcp_data_ready(struct sock *sk) psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) - strp_tcp_data_ready(&psock->strp); + strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } @@ -392,7 +391,7 @@ static int kcm_read_sock_done(struct strparser *strp, int err) return err; } -static void psock_tcp_state_change(struct sock *sk) +static void psock_state_change(struct sock *sk) { /* TCP only does a POLLIN for a half close. Do a POLLHUP here * since application will normally not poll with POLLIN @@ -402,7 +401,7 @@ static void psock_tcp_state_change(struct sock *sk) report_csk_error(sk, EPIPE); } -static void psock_tcp_write_space(struct sock *sk) +static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; @@ -1383,19 +1382,12 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct list_head *head; int index = 0; struct strp_callbacks cb; - - if (csock->ops->family != PF_INET && - csock->ops->family != PF_INET6) - return -EINVAL; + int err; csk = csock->sk; if (!csk) return -EINVAL; - /* Only support TCP for now */ - if (csk->sk_protocol != IPPROTO_TCP) - return -EINVAL; - psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; @@ -1409,7 +1401,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, cb.parse_msg = kcm_parse_func_strparser; cb.read_sock_done = kcm_read_sock_done; - strp_init(&psock->strp, csk, &cb); + err = strp_init(&psock->strp, csk, &cb); + if (err) { + kmem_cache_free(kcm_psockp, psock); + return err; + } sock_hold(csk); @@ -1418,9 +1414,9 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; - csk->sk_data_ready = psock_tcp_data_ready; - csk->sk_write_space = psock_tcp_write_space; - csk->sk_state_change = psock_tcp_state_change; + csk->sk_data_ready = psock_data_ready; + csk->sk_write_space = psock_write_space; + csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); /* Finished initialization, now add the psock to the MUX. */ diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 4ecfc10cbe6d..5c7549b5b92c 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -26,7 +26,6 @@ #include #include #include -#include static struct workqueue_struct *strp_wq; @@ -80,9 +79,16 @@ static void strp_parser_err(struct strparser *strp, int err, strp->cb.abort_parser(strp, err); } +static inline int strp_peek_len(struct strparser *strp) +{ + struct socket *sock = strp->sk->sk_socket; + + return sock->ops->peek_len(sock); +} + /* Lower socket lock held */ -static int strp_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) +static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) { struct strparser *strp = (struct strparser *)desc->arg.data; struct _strp_rx_msg *rxm; @@ -266,12 +272,12 @@ static int strp_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, if (extra < 0) { /* Message not complete yet. */ if (rxm->strp.full_len - rxm->accum_len > - tcp_inq(strp->sk)) { + strp_peek_len(strp)) { /* Don't have the whole messages in the socket * buffer. Set strp->rx_need_bytes to wait for * the rest of the message. Also, set "early * eaten" since we've already buffered the skb - * but don't consume yet per tcp_read_sock. + * but don't consume yet per strp_read_sock. */ if (!rxm->accum_len) { @@ -329,16 +335,17 @@ static int default_read_sock_done(struct strparser *strp, int err) } /* Called with lock held on lower socket */ -static int strp_tcp_read_sock(struct strparser *strp) +static int strp_read_sock(struct strparser *strp) { + struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; desc.arg.data = strp; desc.error = 0; desc.count = 1; /* give more than one skb per call */ - /* sk should be locked here, so okay to do tcp_read_sock */ - tcp_read_sock(strp->sk, &desc, strp_tcp_recv); + /* sk should be locked here, so okay to do read_sock */ + sock->ops->read_sock(strp->sk, &desc, strp_recv); desc.error = strp->cb.read_sock_done(strp, desc.error); @@ -346,10 +353,8 @@ static int strp_tcp_read_sock(struct strparser *strp) } /* Lower sock lock held */ -void strp_tcp_data_ready(struct strparser *strp) +void strp_data_ready(struct strparser *strp) { - struct sock *csk = strp->sk; - if (unlikely(strp->rx_stopped)) return; @@ -360,7 +365,7 @@ void strp_tcp_data_ready(struct strparser *strp) * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ - if (sock_owned_by_user(csk)) { + if (sock_owned_by_user(strp->sk)) { queue_work(strp_wq, &strp->rx_work); return; } @@ -369,24 +374,24 @@ void strp_tcp_data_ready(struct strparser *strp) return; if (strp->rx_need_bytes) { - if (tcp_inq(csk) >= strp->rx_need_bytes) + if (strp_peek_len(strp) >= strp->rx_need_bytes) strp->rx_need_bytes = 0; else return; } - if (strp_tcp_read_sock(strp) == -ENOMEM) + if (strp_read_sock(strp) == -ENOMEM) queue_work(strp_wq, &strp->rx_work); } -EXPORT_SYMBOL_GPL(strp_tcp_data_ready); +EXPORT_SYMBOL_GPL(strp_data_ready); static void do_strp_rx_work(struct strparser *strp) { read_descriptor_t rd_desc; struct sock *csk = strp->sk; - /* We need the read lock to synchronize with strp_tcp_data_ready. We - * need the socket lock for calling tcp_read_sock. + /* We need the read lock to synchronize with strp_data_ready. We + * need the socket lock for calling strp_read_sock. */ lock_sock(csk); @@ -398,7 +403,7 @@ static void do_strp_rx_work(struct strparser *strp) rd_desc.arg.data = strp; - if (strp_tcp_read_sock(strp) == -ENOMEM) + if (strp_read_sock(strp) == -ENOMEM) queue_work(strp_wq, &strp->rx_work); out: @@ -424,9 +429,14 @@ static void strp_rx_msg_timeout(unsigned long arg) int strp_init(struct strparser *strp, struct sock *csk, struct strp_callbacks *cb) { + struct socket *sock = csk->sk_socket; + if (!cb || !cb->rcv_msg || !cb->parse_msg) return -EINVAL; + if (!sock->ops->read_sock || !sock->ops->peek_len) + return -EAFNOSUPPORT; + memset(strp, 0, sizeof(*strp)); strp->sk = csk; @@ -456,7 +466,7 @@ void strp_unpause(struct strparser *strp) } EXPORT_SYMBOL_GPL(strp_unpause); -/* strp must already be stopped so that strp_tcp_recv will no longer be called. +/* strp must already be stopped so that strp_recv will no longer be called. * Note that strp_done is not called with the lower socket held. */ void strp_done(struct strparser *strp) -- cgit v1.2.3 From c9c3321257e1b95be9b375f811fb250162af8d39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Aug 2016 07:37:54 -0700 Subject: tcp: add tcp_add_backlog() When TCP operates in lossy environments (between 1 and 10 % packet losses), many SACK blocks can be exchanged, and I noticed we could drop them on busy senders, if these SACK blocks have to be queued into the socket backlog. While the main cause is the poor performance of RACK/SACK processing, we can try to avoid these drops of valuable information that can lead to spurious timeouts and retransmits. Cause of the drops is the skb->truesize overestimation caused by : - drivers allocating ~2048 (or more) bytes as a fragment to hold an Ethernet frame. - various pskb_may_pull() calls bringing the headers into skb->head might have pulled all the frame content, but skb->truesize could not be lowered, as the stack has no idea of each fragment truesize. The backlog drops are also more visible on bidirectional flows, since their sk_rmem_alloc can be quite big. Let's add some room for the backlog, as only the socket owner can selectively take action to lower memory needs, like collapsing receive queues or partial ofo pruning. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 33 +++++++++++++++++++++++++++++---- net/ipv6/tcp_ipv6.c | 5 +---- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a5af6be3a572..dd99679e2e51 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1161,6 +1161,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) } bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); #undef STATE_TRACE diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad41e8ecf796..53e80cd004b6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1532,6 +1532,34 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_prequeue); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + + /* In case all data was pulled from skb frags (in __pskb_pull_tail()), + * we can fix skb->truesize to its real value to avoid future drops. + * This is valid because skb is not yet charged to the socket. + * It has been noticed pure SACK packets were sometimes dropped + * (if cooked by drivers without copybreak feature). + */ + if (!skb->data_len) + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + + if (unlikely(sk_add_backlog(sk, skb, limit))) { + bh_unlock_sock(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + return true; + } + return false; +} +EXPORT_SYMBOL(tcp_add_backlog); + /* * From tcp_input.c */ @@ -1662,10 +1690,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e4f55683af31..5bf460bd299f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1467,10 +1467,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); -- cgit v1.2.3 From 5711a98221443aec54c4c81ee98c6ae46acccb65 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Ravipati Date: Fri, 26 Aug 2016 01:25:50 -0700 Subject: net: ethtool: add support for 1000BaseX and missing 10G link modes This patch enhances ethtool link mode bitmap to include missing interface modes for 1G/10G speeds Changes: 1000baseX is the mode introduced to cover all 1G Fiber cases. All modes under 1000BaseX i.e. 1000BASE-SX, 1000BASE-LX, 1000BASE-LX10 and 1000BASE-BX10 are not explicitly defined at this moment. 10G CR,SR,LR and ER link modes are included for 10G speed.. Issue: ethtool on 1G/10G SFP port reports Base-T as this port supports 1000baseX,10G CR, SR and LR modes. root@tor-02$ ethtool swp1 Settings for swp1: Supported ports: [ FIBRE ] Supported link modes: 1000baseT/Full 10000baseT/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: Yes Advertised link modes: 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: No Speed: 10000Mb/s Duplex: Full Port: FIBRE PHYAD: 0 Transceiver: external Auto-negotiation: off Current message level: 0x00000000 (0) Link detected: yes After fix: root@tor-02$ ethtool swp1 Settings for swp1: Supported ports: [ FIBRE ] Supported link modes: 1000baseX/Full 10000baseCR/Full 10000baseSR/Full 10000baseLR/Full 10000baseER/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: Yes Advertised link modes: 1000baseT/Full Advertised pause frame use: No Advertised auto-negotiation: No Speed: 10000Mb/s Duplex: Full Port: FIBRE PHYAD: 0 Transceiver: external Auto-negotiation: off Current message level: 0x00000000 (0) Link detected: yes Signed-off-by: Vidya Sagar Ravipati Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b8f38e84d93a..099a4200732c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1362,7 +1362,14 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, - ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* @@ -1371,7 +1378,7 @@ enum ethtool_link_mode_bit_indices { */ __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT, + = ETHTOOL_LINK_MODE_10000baseER_Full_BIT, }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ -- cgit v1.2.3 From 616b14b46957b52dc7e1f3ec2210d3f9051b1178 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:30 +0200 Subject: netfilter: don't rely on DYING bit to detect when destroy event was sent The reliable event delivery mode currently (ab)uses the DYING bit to detect which entries on the dying list have to be skipped when re-delivering events from the eache worker in reliable event mode. Currently when we delete the conntrack from main table we only set this bit if we could also deliver the netlink destroy event to userspace. If we fail we move it to the dying list, the ecache worker will reattempt event delivery for all confirmed conntracks on the dying list that do not have the DYING bit set. Once timer is gone, we can no longer use if (del_timer()) to detect when we 'stole' the reference count owned by the timer/hash entry, so we need some other way to avoid racing with other cpu. Pablo suggested to add a marker in the ecache extension that skips entries that have been unhashed from main table but are still waiting for the last reference count to be dropped (e.g. because one skb waiting on nfqueue verdict still holds a reference). We do this by adding a tristate. If we fail to deliver the destroy event, make a note of this in the eache extension. The worker can then skip all entries that are in a different state. Either they never delivered a destroy event, e.g. because the netlink backend was not loaded, or redelivery took place already. Once the conntrack timer is removed we will now be able to replace del_timer() test with test_and_set_bit(DYING, &ct->status) to avoid racing with other cpu that tries to evict the same conntrack. Because DYING will then be set right before we report the destroy event we can no longer skip event reporting when dying bit is set. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 17 ++++++++++++----- net/netfilter/nf_conntrack_ecache.c | 22 ++++++++++++++-------- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index fa36447371c6..12d967b58726 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,12 +12,19 @@ #include #include +enum nf_ct_ecache_state { + NFCT_ECACHE_UNKNOWN, /* destroy event not sent */ + NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ + NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ +}; + struct nf_conntrack_ecache { - unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ - u16 ctmask; /* bitmask of ct events to be delivered */ - u16 expmask; /* bitmask of expect events to be delivered */ - u32 portid; /* netlink portid of destroyer */ + unsigned long cache; /* bitops want long */ + unsigned long missed; /* missed events */ + u16 ctmask; /* bitmask of ct events to be delivered */ + u16 expmask; /* bitmask of expect events to be delivered */ + u32 portid; /* netlink portid of destroyer */ + enum nf_ct_ecache_state state; /* ecache state */ }; static inline struct nf_conntrack_ecache * diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index d28011b42845..da9df2d56e66 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_ecache *e; - if (nf_ct_is_dying(ct)) + if (!nf_ct_is_confirmed(ct)) + continue; + + e = nf_ct_ecache_find(ct); + if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; if (nf_conntrack_event(IPCT_DESTROY, ct)) { @@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) break; } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); + e->state = NFCT_ECACHE_DESTROY_SENT; refs[evicted] = ct; if (++evicted >= ARRAY_SIZE(refs)) { @@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, if (!e) goto out_unlock; - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + if (nf_ct_is_confirmed(ct)) { struct nf_ct_event item = { .ct = ct, .portid = e->portid ? e->portid : portid, @@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, * triggered by a process, we store the PORTID * to include it in the retransmission. */ - if (eventmask & (1 << IPCT_DESTROY) && - e->portid == 0 && portid != 0) - e->portid = portid; - else + if (eventmask & (1 << IPCT_DESTROY)) { + if (e->portid == 0 && portid != 0) + e->portid = portid; + e->state = NFCT_ECACHE_DESTROY_FAIL; + } else { e->missed |= eventmask; + } } else { e->missed &= ~missed; } -- cgit v1.2.3 From f330a7fdbe1611104622faff7e614a246a7d20f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:31 +0200 Subject: netfilter: conntrack: get rid of conntrack timer With stats enabled this eats 80 bytes on x86_64 per nf_conn entry, as Eric Dumazet pointed out during netfilter workshop 2016. Eric also says: "Another reason was the fact that Thomas was about to change max timer range [..]" (500462a9de657f8, 'timers: Switch to a non-cascading wheel'). Remove the timer and use a 32bit jiffies value containing timestamp until entry is valid. During conntrack lookup, even before doing tuple comparision, check the timeout value and evict the entry in case it is too old. The dying bit is used as a synchronization point to avoid races where multiple cpus try to evict the same entry. Because lookup is always lockless, we need to bump the refcnt once when we evict, else we could try to evict already-dead entry that is being recycled. This is the standard/expected way when conntrack entries are destroyed. Followup patches will introduce garbage colliction via work queue and further places where we can reap obsoleted entries (e.g. during netlink dumps), this is needed to avoid expired conntracks from hanging around for too long when lookup rate is low after a busy period. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 23 +++++++-- net/netfilter/nf_conntrack_core.c | 91 ++++++++++++++++++++---------------- net/netfilter/nf_conntrack_netlink.c | 14 ++---- net/netfilter/nf_conntrack_pptp.c | 3 +- net/netfilter/nf_nat_core.c | 6 --- 5 files changed, 74 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2a127480d4cc..7277751128e8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -42,7 +42,6 @@ union nf_conntrack_expect_proto { #include #include -#include #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) WARN_ON(!(x)) @@ -73,7 +72,7 @@ struct nf_conn_help { #include struct nf_conn { - /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, + /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->nfct and @@ -96,8 +95,8 @@ struct nf_conn { /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; - /* Timer function; drops refcnt when it goes off. */ - struct timer_list timeout; + /* jiffies32 when this ct is considered dead */ + u32 timeout; possible_net_t ct_net; @@ -291,14 +290,28 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +#define nfct_time_stamp ((u32)(jiffies)) + /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { - long timeout = (long)ct->timeout.expires - (long)jiffies; + s32 timeout = ct->timeout - nfct_time_stamp; return timeout > 0 ? timeout : 0; } +static inline bool nf_ct_is_expired(const struct nf_conn *ct) +{ + return (__s32)(ct->timeout - nfct_time_stamp) <= 0; +} + +/* use after obtaining a reference count */ +static inline bool nf_ct_should_gc(const struct nf_conn *ct) +{ + return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && + !nf_ct_is_dying(ct); +} + struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 887926aefc72..87ee6dad777c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -371,7 +371,6 @@ destroy_conntrack(struct nf_conntrack *nfct) pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); - NF_CT_ASSERT(!timer_pending(&ct->timeout)); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -434,35 +433,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) + return false; + tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_get_real_ns(); - if (nf_ct_is_dying(ct)) - goto delete; - if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { - /* destroy event was not delivered */ + /* destroy event was not delivered. nf_ct_put will + * be done by event cache worker on redelivery. + */ nf_ct_delete_from_lists(ct); nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); return false; } nf_conntrack_ecache_work(nf_ct_net(ct)); - set_bit(IPS_DYING_BIT, &ct->status); - delete: nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; } EXPORT_SYMBOL_GPL(nf_ct_delete); -static void death_by_timeout(unsigned long ul_conntrack) -{ - nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); -} - static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, @@ -480,6 +474,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +/* caller must hold rcu readlock and none of the nf_conntrack_locks */ +static void nf_ct_gc_expired(struct nf_conn *ct) +{ + if (!atomic_inc_not_zero(&ct->ct_general.use)) + return; + + if (nf_ct_should_gc(ct)) + nf_ct_kill(ct); + + nf_ct_put(ct); +} + /* * Warning : * - Caller must take a reference on returned object @@ -499,6 +505,17 @@ begin: bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + struct nf_conn *ct; + + ct = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; + } + + if (nf_ct_is_dying(ct)) + continue; + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); return h; @@ -597,7 +614,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone, net)) goto out; - add_timer(&ct->timeout); smp_wmb(); /* The caller holds a reference to this object */ atomic_set(&ct->ct_general.use, 2); @@ -750,8 +766,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); + ct->timeout += nfct_time_stamp; atomic_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; @@ -815,8 +830,16 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); - if (ct != ignored_conntrack && - nf_ct_key_equal(h, tuple, zone, net)) { + + if (ct == ignored_conntrack) + continue; + + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; + } + + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; @@ -850,6 +873,11 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(tmp)) { + nf_ct_gc_expired(tmp); + continue; + } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || !net_eq(nf_ct_net(tmp), net) || nf_ct_is_dying(tmp)) @@ -867,7 +895,6 @@ static unsigned int early_drop_list(struct net *net, */ if (net_eq(nf_ct_net(tmp), net) && nf_ct_is_confirmed(tmp) && - del_timer(&tmp->timeout) && nf_ct_delete(tmp, 0, 0)) drops++; @@ -937,8 +964,6 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; - /* Don't set timer yet: wait for confirmation */ - setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - @@ -1312,7 +1337,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); /* Only update if this is not a fixed timeout */ @@ -1320,18 +1344,10 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, goto acct; /* If not in hash table, timer will not be active yet */ - if (!nf_ct_is_confirmed(ct)) { - ct->timeout.expires = extra_jiffies; - } else { - unsigned long newtime = jiffies + extra_jiffies; - - /* Only update the timeout if the new timeout is at least - HZ jiffies from the old timeout. Need del_timer for race - avoidance (may already be dying). */ - if (newtime - ct->timeout.expires >= HZ) - mod_timer_pending(&ct->timeout, newtime); - } + if (nf_ct_is_confirmed(ct)) + extra_jiffies += nfct_time_stamp; + ct->timeout = extra_jiffies; acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); @@ -1346,11 +1362,7 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); - if (del_timer(&ct->timeout)) { - ct->timeout.function((unsigned long)ct); - return true; - } - return false; + return nf_ct_delete(ct, 0, 0); } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); @@ -1485,11 +1497,8 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, portid, report); - - /* ... else the timer will get him soon. */ + nf_ct_delete(ct, portid, report); nf_ct_put(ct); cond_resched(); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 68800c10a320..81fd34ce0a57 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1144,9 +1144,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); - + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); return 0; @@ -1514,11 +1512,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct, { u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - if (!del_timer(&ct->timeout)) - return -ETIME; + ct->timeout = nfct_time_stamp + timeout * HZ; - ct->timeout.expires = jiffies + timeout * HZ; - add_timer(&ct->timeout); + if (test_bit(IPS_DYING_BIT, &ct->status)) + return -ETIME; return 0; } @@ -1716,9 +1713,8 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; rcu_read_lock(); if (cda[CTA_HELP]) { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5588c7ae1ac2..f60a4755d71e 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, pr_debug("setting timeout of conntrack %p to 0\n", sibling); sibling->proto.gre.timeout = 0; sibling->proto.gre.stream_timeout = 0; - if (del_timer(&sibling->timeout)) - sibling->timeout.function((unsigned long)sibling); + nf_ct_kill(sibling); nf_ct_put(sibling); return 1; } else { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..81ae41f85d3a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -565,16 +565,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - if (!del_timer(&ct->timeout)) - return 1; - ct->status &= ~IPS_NAT_DONE_MASK; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, nf_nat_bysource_params); - add_timer(&ct->timeout); - /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ -- cgit v1.2.3 From ad66713f5a20034b3b2a0cbc184319b2ede93f11 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:35 +0200 Subject: netfilter: remove __nf_ct_kill_acct helper After timer removal this just calls nf_ct_delete so remove the __ prefix version and make nf_ct_kill a shorthand for nf_ct_delete. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 13 +++---------- net/netfilter/nf_conntrack_core.c | 12 +++++------- 2 files changed, 8 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 7277751128e8..50418052a520 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -219,21 +219,14 @@ static inline void nf_ct_refresh(struct nf_conn *ct, __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0); } -bool __nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, int do_acct); - /* kill conntrack and do accounting */ -static inline bool nf_ct_kill_acct(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb) -{ - return __nf_ct_kill_acct(ct, ctinfo, skb, 1); -} +bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { - return __nf_ct_kill_acct(ct, 0, NULL, 0); + return nf_ct_delete(ct, 0, 0); } /* These are for NAT. Icky. */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c66ce401ce9..ac1db4019d5c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1430,17 +1430,15 @@ acct: } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); -bool __nf_ct_kill_acct(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - int do_acct) +bool nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) { - if (do_acct) - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, ctinfo, skb->len); return nf_ct_delete(ct, 0, 0); } -EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); +EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) -- cgit v1.2.3 From 779994fa3636d46848edb402fe7517968e036e6f Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 29 Aug 2016 18:25:28 +0800 Subject: netfilter: log: Check param to avoid overflow in nf_log_set The nf_log_set is an interface function, so it should do the strict sanity check of parameters. Convert the return value of nf_log_set as int instead of void. When the pf is invalid, return -EOPNOTSUPP. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 3 +-- net/bridge/netfilter/nf_log_bridge.c | 3 +-- net/ipv4/netfilter/nf_log_arp.c | 3 +-- net/ipv4/netfilter/nf_log_ipv4.c | 3 +-- net/ipv6/netfilter/nf_log_ipv6.c | 3 +-- net/netfilter/nf_log.c | 8 +++++--- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 83d855ba6af1..ee07dc8b0a7b 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -60,8 +60,7 @@ struct nf_logger { int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); -void nf_log_set(struct net *net, u_int8_t pf, - const struct nf_logger *logger); +int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); int nf_log_bind_pf(struct net *net, u_int8_t pf, diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index 5d9953a90929..1663df598545 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -50,8 +50,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = { static int __net_init nf_log_bridge_net_init(struct net *net) { - nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); - return 0; + return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); } static void __net_exit nf_log_bridge_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index cf8f2d4e867a..8945c2653814 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = { static int __net_init nf_log_arp_net_init(struct net *net) { - nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); - return 0; + return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); } static void __net_exit nf_log_arp_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 076aadda0473..20f225593a8b 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = { static int __net_init nf_log_ipv4_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); } static void __net_exit nf_log_ipv4_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 8dd869642f45..c1bcf699a23d 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = { static int __net_init nf_log_ipv6_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); } static void __net_exit nf_log_ipv6_net_exit(struct net *net) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index aa5847a16713..30a17d649a83 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -39,12 +39,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } -void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) { const struct nf_logger *log; - if (pf == NFPROTO_UNSPEC) - return; + if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers)) + return -EOPNOTSUPP; mutex_lock(&nf_log_mutex); log = nft_log_dereference(net->nf.nf_loggers[pf]); @@ -52,6 +52,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); + + return 0; } EXPORT_SYMBOL(nf_log_set); -- cgit v1.2.3 From e34d4234b0b77a8a8b6dd7cf29aff468c288d9e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Aug 2016 09:49:29 +0100 Subject: rxrpc: Trace rxrpc_call usage Add a trace event for debuging rxrpc_call struct usage. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 39 +++++++++++++++++++ net/rxrpc/ar-internal.h | 19 +++------- net/rxrpc/call_accept.c | 5 ++- net/rxrpc/call_event.c | 11 +++--- net/rxrpc/call_object.c | 90 ++++++++++++++++++++++++++++++++++++++++---- net/rxrpc/conn_client.c | 1 + net/rxrpc/conn_event.c | 1 + net/rxrpc/input.c | 4 +- net/rxrpc/output.c | 1 + net/rxrpc/peer_event.c | 1 + net/rxrpc/recvmsg.c | 1 + net/rxrpc/skbuff.c | 4 +- 12 files changed, 143 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 15283ee3e41a..cbe574ea674b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,45 @@ #include +TRACE_EVENT(rxrpc_call, + TP_PROTO(struct rxrpc_call *call, int op, int usage, int nskb, + const void *where, const void *aux), + + TP_ARGS(call, op, usage, nskb, where, aux), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(int, op ) + __field(int, usage ) + __field(int, nskb ) + __field(const void *, where ) + __field(const void *, aux ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->op = op; + __entry->usage = usage; + __entry->nskb = nskb; + __entry->where = where; + __entry->aux = aux; + ), + + TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", + __entry->call, + (__entry->op == 0 ? "NWc" : + __entry->op == 1 ? "NWs" : + __entry->op == 2 ? "SEE" : + __entry->op == 3 ? "GET" : + __entry->op == 4 ? "Gsb" : + __entry->op == 5 ? "PUT" : + "Psb"), + __entry->usage, + __entry->nskb, + __entry->where, + __entry->aux) + ); + TRACE_EVENT(rxrpc_skb, TP_PROTO(struct sk_buff *skb, int op, int usage, int mod_count, const void *where), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ce6afd931e91..0c320b2b7b43 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -543,7 +543,11 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); -void __rxrpc_put_call(struct rxrpc_call *); +void rxrpc_see_call(struct rxrpc_call *); +void rxrpc_get_call(struct rxrpc_call *); +void rxrpc_put_call(struct rxrpc_call *); +void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); +void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void __exit rxrpc_destroy_all_calls(void); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) @@ -1022,16 +1026,3 @@ do { \ } while (0) #endif /* __KDEBUGALL */ - - -#define rxrpc_get_call(CALL) \ -do { \ - CHECK_SLAB_OKAY(&(CALL)->usage); \ - if (atomic_inc_return(&(CALL)->usage) == 1) \ - BUG(); \ -} while (0) - -#define rxrpc_put_call(CALL) \ -do { \ - __rxrpc_put_call(CALL); \ -} while (0) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index ef9ef0d6c917..03af88fe798b 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -129,8 +129,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("conn ready"); call->state = RXRPC_CALL_SERVER_ACCEPTING; list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + rxrpc_get_call_for_skb(call, notification); nsp = rxrpc_skb(notification); nsp->call = call; @@ -323,6 +322,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); + rxrpc_see_call(call); write_lock_bh(&call->state_lock); switch (call->state) { @@ -395,6 +395,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); + rxrpc_see_call(call); write_lock_bh(&call->state_lock); switch (call->state) { diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 94c7751fd99a..02fe4a4b60d9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -465,8 +465,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, skb->destructor = rxrpc_packet_destructor; ASSERTCMP(sp->call, ==, NULL); sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + rxrpc_get_call_for_skb(call, skb); /* insert into the buffer in sequence order */ spin_lock_bh(&call->lock); @@ -741,8 +740,7 @@ all_acked: _debug("post ACK"); skb->mark = RXRPC_SKB_MARK_FINAL_ACK; sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + rxrpc_get_call_for_skb(call, skb); spin_lock_bh(&call->lock); if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) BUG(); @@ -801,8 +799,7 @@ static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, memset(sp, 0, sizeof(*sp)); sp->error = error; sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + rxrpc_get_call_for_skb(call, skb); spin_lock_bh(&call->lock); ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); @@ -834,6 +831,8 @@ void rxrpc_process_call(struct work_struct *work) u32 serial, abort_code = RX_PROTOCOL_ERROR; u8 *acks = NULL; + rxrpc_see_call(call); + //printk("\n--------------------\n"); _enter("{%d,%s,%lx} [%lu]", call->debug_id, rxrpc_call_states[call->state], call->events, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 852c30dc7b75..104ee8b1de06 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -219,6 +219,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; + const void *here = __builtin_return_address(0); int ret; _enter("%p,%lx", rx, user_call_ID); @@ -229,6 +230,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } + trace_rxrpc_call(call, 0, atomic_read(&call->usage), 0, here, + (const void *)user_call_ID); + /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); @@ -308,6 +312,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call, *candidate; + const void *here = __builtin_return_address(0); u32 call_id, chan; _enter(",%d", conn->debug_id); @@ -318,6 +323,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); + trace_rxrpc_call(candidate, 1, atomic_read(&candidate->usage), + 0, here, NULL); + chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; candidate->conn = conn; @@ -430,6 +438,44 @@ old_call: return ERR_PTR(-ECONNRESET); } +/* + * Note the re-emergence of a call. + */ +void rxrpc_see_call(struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + if (call) { + int n = atomic_read(&call->usage); + int m = atomic_read(&call->skb_count); + + trace_rxrpc_call(call, 2, n, m, here, 0); + } +} + +/* + * Note the addition of a ref on a call. + */ +void rxrpc_get_call(struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&call->usage); + int m = atomic_read(&call->skb_count); + + trace_rxrpc_call(call, 3, n, m, here, 0); +} + +/* + * Note the addition of a ref on a call for a socket buffer. + */ +void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&call->usage); + int m = atomic_inc_return(&call->skb_count); + + trace_rxrpc_call(call, 4, n, m, here, skb); +} + /* * detach a call from a socket and set up for release */ @@ -443,6 +489,8 @@ void rxrpc_release_call(struct rxrpc_call *call) atomic_read(&call->ackr_not_idle), call->rx_first_oos); + rxrpc_see_call(call); + spin_lock_bh(&call->lock); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -526,6 +574,7 @@ static void rxrpc_dead_call_expired(unsigned long _call) _enter("{%d}", call->debug_id); + rxrpc_see_call(call); write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_DEAD; write_unlock_bh(&call->state_lock); @@ -540,6 +589,7 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call) { bool sched; + rxrpc_see_call(call); write_lock(&call->state_lock); if (call->state < RXRPC_CALL_DEAD) { sched = __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); @@ -585,21 +635,43 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* * release a call */ -void __rxrpc_put_call(struct rxrpc_call *call) +void rxrpc_put_call(struct rxrpc_call *call) { - ASSERT(call != NULL); + const void *here = __builtin_return_address(0); + int n, m; - _enter("%p{u=%d}", call, atomic_read(&call->usage)); + ASSERT(call != NULL); - ASSERTCMP(atomic_read(&call->usage), >, 0); + n = atomic_dec_return(&call->usage); + m = atomic_read(&call->skb_count); + trace_rxrpc_call(call, 5, n, m, here, NULL); + ASSERTCMP(n, >=, 0); + if (n == 0) { + _debug("call %d dead", call->debug_id); + WARN_ON(m != 0); + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + rxrpc_queue_work(&call->destroyer); + } +} - if (atomic_dec_and_test(&call->usage)) { +/* + * Release a call ref held by a socket buffer. + */ +void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) +{ + const void *here = __builtin_return_address(0); + int n, m; + + n = atomic_dec_return(&call->usage); + m = atomic_dec_return(&call->skb_count); + trace_rxrpc_call(call, 6, n, m, here, skb); + ASSERTCMP(n, >=, 0); + if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(atomic_read(&call->skb_count) != 0); + WARN_ON(m != 0); ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); rxrpc_queue_work(&call->destroyer); } - _leave(""); } /* @@ -705,6 +777,7 @@ void __exit rxrpc_destroy_all_calls(void) call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); + rxrpc_see_call(call); list_del_init(&call->link); switch (atomic_read(&call->usage)) { @@ -748,6 +821,7 @@ static void rxrpc_call_life_expired(unsigned long _call) _enter("{%d}", call->debug_id); + rxrpc_see_call(call); if (call->state >= RXRPC_CALL_COMPLETE) return; @@ -765,6 +839,7 @@ static void rxrpc_resend_time_expired(unsigned long _call) _enter("{%d}", call->debug_id); + rxrpc_see_call(call); if (call->state >= RXRPC_CALL_COMPLETE) return; @@ -782,6 +857,7 @@ static void rxrpc_ack_time_expired(unsigned long _call) _enter("{%d}", call->debug_id); + rxrpc_see_call(call); if (call->state >= RXRPC_CALL_COMPLETE) return; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 44850a2d90b5..4b213bc0f554 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -537,6 +537,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + rxrpc_see_call(call); list_del_init(&call->chan_wait_link); conn->active_chans |= 1 << channel; call->peer = rxrpc_get_peer(conn->params.peer); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index bcea99c73b40..bc9b05938ff5 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -157,6 +157,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, conn->channels[i].call, lockdep_is_held(&conn->channel_lock)); if (call) { + rxrpc_see_call(call); write_lock_bh(&call->state_lock); if (rxrpc_set_call_completion(call, compl, abort_code, error)) { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index af49c2992c4a..86bea9ad6c3d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -196,8 +196,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, goto enqueue_packet; sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + rxrpc_get_call_for_skb(call, skb); terminal = ((flags & RXRPC_LAST_PACKET) && !(flags & RXRPC_CLIENT_INITIATED)); ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); @@ -748,6 +747,7 @@ void rxrpc_data_ready(struct sock *sk) if (!call || atomic_read(&call->usage) == 0) goto cant_route_call; + rxrpc_see_call(call); rxrpc_post_packet_to_call(call, skb); goto out_unlock; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 036e1112b0c5..888fa87ed1d6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -207,6 +207,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); } + rxrpc_see_call(call); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 865078d76ad3..27b9ecad007e 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -270,6 +270,7 @@ void rxrpc_peer_error_distributor(struct work_struct *work) call = hlist_entry(peer->error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); + rxrpc_see_call(call); queue = false; write_lock(&call->state_lock); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 96d98a3a7087..c9b38c7fb448 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -115,6 +115,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sp = rxrpc_skb(skb); call = sp->call; ASSERT(call != NULL); + rxrpc_see_call(call); _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index fbd8c74d9505..20529205bb8c 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -140,9 +140,7 @@ void rxrpc_packet_destructor(struct sk_buff *skb) _enter("%p{%p}", skb, call); if (call) { - if (atomic_dec_return(&call->skb_count) < 0) - BUG(); - rxrpc_put_call(call); + rxrpc_put_call_for_skb(call, skb); sp->call = NULL; } -- cgit v1.2.3 From 8324f0bcfbfc645cf248e4b93ab58341b7d3b135 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Aug 2016 09:49:29 +0100 Subject: rxrpc: Provide a way for AFS to ask for the peer address of a call Provide a function so that kernel users, such as AFS, can ask for the peer address of a call: void rxrpc_kernel_get_peer(struct rxrpc_call *call, struct sockaddr_rxrpc *_srx); In the future the kernel service won't get sk_buffs to look inside. Further, this allows us to hide any canonicalisation inside AF_RXRPC for when IPv6 support is added. Also propagate this through to afs_find_server() and issue a warning if we can't handle the address family yet. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 7 +++++++ fs/afs/cmservice.c | 20 +++++++++++--------- fs/afs/internal.h | 5 ++++- fs/afs/rxrpc.c | 2 +- fs/afs/server.c | 11 ++++++++--- include/net/af_rxrpc.h | 2 ++ net/rxrpc/peer_object.c | 15 +++++++++++++++ 7 files changed, 48 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 70c926ae212d..dfe0b008df74 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -868,6 +868,13 @@ The kernel interface functions are as follows: This is used to allocate a null RxRPC key that can be used to indicate anonymous security for a particular domain. + (*) Get the peer address of a call. + + void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *_srx); + + This is used to find the remote peer address of a call. + ======================= CONFIGURABLE PARAMETERS diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index ca32d891bbc3..77ee481059ac 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -167,9 +167,9 @@ static void SRXAFSCB_CallBack(struct work_struct *work) static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, bool last) { + struct sockaddr_rxrpc srx; struct afs_callback *cb; struct afs_server *server; - struct in_addr addr; __be32 *bp; u32 tmp; int ret, loop; @@ -178,6 +178,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, switch (call->unmarshall) { case 0: + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; @@ -282,8 +283,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -314,12 +314,14 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, struct sk_buff *skb, bool last) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; int ret; _enter(",{%u},%d", skb->len, last); + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + ret = afs_data_complete(call, skb, last); if (ret < 0) return ret; @@ -329,8 +331,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -347,11 +348,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, struct sk_buff *skb, bool last) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; _enter(",{%u},%d", skb->len, last); + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + /* There are some arguments that we ignore */ afs_data_consumed(call, skb); if (!last) @@ -362,8 +365,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df976b2a7f40..d97552de9c59 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "afs.h" #include "afs_vl.h" @@ -607,6 +608,8 @@ extern void afs_proc_cell_remove(struct afs_cell *); /* * rxrpc.c */ +extern struct socket *afs_socket; + extern int afs_open_socket(void); extern void afs_close_socket(void); extern void afs_data_consumed(struct afs_call *, struct sk_buff *); @@ -654,7 +657,7 @@ do { \ extern struct afs_server *afs_lookup_server(struct afs_cell *, const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); +extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); extern void afs_put_server(struct afs_server *); extern void __exit afs_purge_servers(void); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 14d04c848465..a1916750e2f9 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,7 +16,7 @@ #include "internal.h" #include "afs_cm.h" -static struct socket *afs_socket; /* my RxRPC socket */ +struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; static atomic_t afs_outstanding_calls; static atomic_t afs_outstanding_skbs; diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf3547d..d4066ab7dd55 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -178,13 +178,18 @@ server_in_two_cells: /* * look up a server by its IP address */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) { struct afs_server *server = NULL; struct rb_node *p; - struct in_addr addr = *_addr; + struct in_addr addr = srx->transport.sin.sin_addr; - _enter("%pI4", &addr.s_addr); + _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + + if (srx->transport.family != AF_INET) { + WARN(true, "AFS does not yes support non-IPv4 addresses\n"); + return NULL; + } read_lock(&afs_servers_lock); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 7b0f88699b25..f9224e835d43 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -49,5 +49,7 @@ int rxrpc_kernel_get_error_number(struct sk_buff *); void rxrpc_kernel_free_skb(struct sk_buff *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long); int rxrpc_kernel_reject_call(struct socket *); +void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, + struct sockaddr_rxrpc *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 538e9831c699..aebc73ac16dc 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -313,3 +313,18 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) kfree_rcu(peer, rcu); } + +/** + * rxrpc_kernel_get_peer - Get the peer address of a call + * @sock: The socket on which the call is in progress. + * @call: The call to query + * @_srx: Where to place the result + * + * Get the address of the remote peer in a call. + */ +void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *_srx) +{ + *_srx = call->peer->srx; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer); -- cgit v1.2.3 From 4de48af663d88d8c9a2550e60725f5a5c660970b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Aug 2016 12:00:48 +0100 Subject: rxrpc: Pass struct socket * to more rxrpc kernel interface functions Pass struct socket * to more rxrpc kernel interface functions. They should be starting from this rather than the socket pointer in the rxrpc_call struct if they need to access the socket. I have left: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() unmodified as they're all about to be removed (and, in any case, don't touch the socket). Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 11 ++++++++--- fs/afs/rxrpc.c | 26 +++++++++++++++----------- include/net/af_rxrpc.h | 10 +++++++--- net/rxrpc/af_rxrpc.c | 5 +++-- net/rxrpc/output.c | 20 +++++++++++--------- 5 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index dfe0b008df74..cfc8cb91452f 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -725,7 +725,8 @@ The kernel interface functions are as follows: (*) End a client call. - void rxrpc_kernel_end_call(struct rxrpc_call *call); + void rxrpc_kernel_end_call(struct socket *sock, + struct rxrpc_call *call); This is used to end a previously begun call. The user_call_ID is expunged from AF_RXRPC's knowledge and will not be seen again in association with @@ -733,7 +734,9 @@ The kernel interface functions are as follows: (*) Send data through a call. - int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, + int rxrpc_kernel_send_data(struct socket *sock, + struct rxrpc_call *call, + struct msghdr *msg, size_t len); This is used to supply either the request part of a client call or the @@ -747,7 +750,9 @@ The kernel interface functions are as follows: (*) Abort a call. - void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code); + void rxrpc_kernel_abort_call(struct socket *sock, + struct rxrpc_call *call, + u32 abort_code); This is used to abort a call if it's still in an abortable state. The abort code specified will be placed in the ABORT message sent. diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a1916750e2f9..7b0d18900f50 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -207,7 +207,7 @@ static void afs_free_call(struct afs_call *call) static void afs_end_call_nofree(struct afs_call *call) { if (call->rxcall) { - rxrpc_kernel_end_call(call->rxcall); + rxrpc_kernel_end_call(afs_socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -325,8 +325,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * returns from sending the request */ if (first + loop >= last) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, to - offset); kunmap(pages[loop]); if (ret < 0) break; @@ -406,7 +406,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, * request */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + ret = rxrpc_kernel_send_data(afs_socket, rxcall, + &msg, call->request_size); if (ret < 0) goto error_do_abort; @@ -421,7 +422,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); error_kill_call: @@ -509,7 +510,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; do_abort: - rxrpc_kernel_abort_call(call->rxcall, + rxrpc_kernel_abort_call(afs_socket, + call->rxcall, abort_code); call->error = ret; call->state = AFS_CALL_ERROR; @@ -605,7 +607,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_CALL_DEAD); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); } @@ -823,14 +825,15 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT); default: afs_end_call(call); _leave(" [error]"); @@ -859,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); if (n >= 0) { /* Success */ _leave(" [replied]"); @@ -868,7 +871,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT); } afs_end_call(call); _leave(" [error]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f9224e835d43..f8d8079dc058 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -15,6 +15,9 @@ #include #include +struct key; +struct sock; +struct socket; struct rxrpc_call; /* @@ -39,10 +42,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, struct key *, unsigned long, gfp_t); -int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, size_t); +int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, + struct msghdr *, size_t); void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); -void rxrpc_kernel_abort_call(struct rxrpc_call *, u32); -void rxrpc_kernel_end_call(struct rxrpc_call *); +void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32); +void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_is_data_last(struct sk_buff *); u32 rxrpc_kernel_get_abort_code(struct sk_buff *); int rxrpc_kernel_get_error_number(struct sk_buff *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index c7cf356b42b8..e07c91acd904 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -279,15 +279,16 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); /** * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * @sock: The socket the call is on * @call: The call to end * * Allow a kernel service to end a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ -void rxrpc_kernel_end_call(struct rxrpc_call *call) +void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); - rxrpc_remove_user_ID(call->socket, call); + rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); rxrpc_put_call(call); } EXPORT_SYMBOL(rxrpc_kernel_end_call); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 888fa87ed1d6..b1e708a12151 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -239,6 +239,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /** * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @sock: The socket the call is on * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send @@ -248,8 +249,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. */ -int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, - size_t len) +int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, size_t len) { int ret; @@ -258,7 +259,7 @@ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); - lock_sock(&call->socket->sk); + lock_sock(sock->sk); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -270,35 +271,36 @@ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, call->state != RXRPC_CALL_SERVER_SEND_REPLY) { ret = -EPROTO; /* request phase complete for this client call */ } else { - ret = rxrpc_send_data(call->socket, call, msg, len); + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); } - release_sock(&call->socket->sk); + release_sock(sock->sk); _leave(" = %d", ret); return ret; } - EXPORT_SYMBOL(rxrpc_kernel_send_data); /** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * * Allow a kernel service to abort a call, if it's still in an abortable state. */ -void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) +void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, + u32 abort_code) { _enter("{%d},%d", call->debug_id, abort_code); - lock_sock(&call->socket->sk); + lock_sock(sock->sk); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); rxrpc_send_abort(call, abort_code); - release_sock(&call->socket->sk); + release_sock(sock->sk); _leave(""); } -- cgit v1.2.3 From 14972cbd34ff668c390cbd2e6497323484c9e812 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 24 Aug 2016 20:10:43 -0700 Subject: net: lwtunnel: Handle fragmentation Today mpls iptunnel lwtunnel_output redirect expects the tunnel output function to handle fragmentation. This is ok but can be avoided if we did not do the mpls output redirect too early. ie we could wait until ip fragmentation is done and then call mpls output for each ip fragment. To make this work we will need, 1) the lwtunnel state to carry encap headroom 2) and do the redirect to the encap output handler on the ip fragment (essentially do the output redirect after fragmentation) This patch adds tunnel headroom in lwtstate to make sure we account for tunnel data in mtu calculations during fragmentation and adds new xmit redirect handler to redirect to lwtunnel xmit func after ip fragmentation. This includes IPV6 and some mtu fixes and testing from David Ahern. Signed-off-by: Roopa Prabhu Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/core/lwtunnel.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 8 ++++++++ net/ipv4/route.c | 4 +++- net/ipv6/ip6_output.c | 8 ++++++++ net/ipv6/route.c | 4 +++- net/mpls/mpls_iptunnel.c | 9 +++++---- 7 files changed, 106 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index e9f116e29c22..ea3f80f58fd6 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -13,6 +13,13 @@ /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) +#define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) + +enum { + LWTUNNEL_XMIT_DONE, + LWTUNNEL_XMIT_CONTINUE, +}; + struct lwtunnel_state { __u16 type; @@ -21,6 +28,7 @@ struct lwtunnel_state { int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); int len; + __u16 headroom; __u8 data[0]; }; @@ -34,6 +42,7 @@ struct lwtunnel_encap_ops { struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); + int (*xmit)(struct sk_buff *skb); }; #ifdef CONFIG_LWTUNNEL @@ -75,6 +84,24 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) return false; } + +static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) + return true; + + return false; +} + +static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, + unsigned int mtu) +{ + if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu) + return lwtstate->headroom; + + return 0; +} + int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, @@ -90,6 +117,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); +int lwtunnel_xmit(struct sk_buff *skb); #else @@ -117,6 +145,17 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, + unsigned int mtu) +{ + return 0; +} + static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { @@ -170,6 +209,11 @@ static inline int lwtunnel_input(struct sk_buff *skb) return -EOPNOTSUPP; } +static inline int lwtunnel_xmit(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 669ecc9f884e..e5f84c26ba1a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -251,6 +251,41 @@ drop: } EXPORT_SYMBOL(lwtunnel_output); +int lwtunnel_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->xmit)) + ret = ops->xmit(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_xmit); + int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index dde37fb340bf..65569274efb8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -197,6 +198,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s skb = skb2; } + if (lwtunnel_xmit_redirect(dst->lwtstate)) { + int res = lwtunnel_xmit(skb); + + if (res < 0 || res == LWTUNNEL_XMIT_DONE) + return res; + } + rcu_read_lock_bh(); nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8110..3e992783c1d0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1246,7 +1246,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } - return min_t(unsigned int, mtu, IP_MAX_MTU); + mtu = min_t(unsigned int, mtu, IP_MAX_MTU); + + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1dfc402d9ad1..993fd9666f1b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,6 +56,7 @@ #include #include #include +#include static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -104,6 +105,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } } + if (lwtunnel_xmit_redirect(dst->lwtstate)) { + int res = lwtunnel_xmit(skb); + + if (res < 0 || res == LWTUNNEL_XMIT_DONE) + return res; + } + rcu_read_lock_bh(); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49817555449e..09d43ff11a8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1604,7 +1604,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) rcu_read_unlock(); out: - return min_t(unsigned int, mtu, IP6_MAX_MTU); + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static struct dst_entry *icmp6_dst_gc_list; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 644a8da6d4bd..aed872cc05a6 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) return en->labels * sizeof(struct mpls_shim_hdr); } -static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int mpls_xmit(struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; @@ -115,7 +115,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); - return 0; + return LWTUNNEL_XMIT_DONE; drop: kfree_skb(skb); @@ -153,7 +153,8 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla, if (ret) goto errout; newts->type = LWTUNNEL_ENCAP_MPLS; - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + newts->headroom = mpls_encap_size(tun_encap_info); *ts = newts; @@ -209,7 +210,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) static const struct lwtunnel_encap_ops mpls_iptun_ops = { .build_state = mpls_build_state, - .output = mpls_output, + .xmit = mpls_xmit, .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, -- cgit v1.2.3 From 8df3025520aaeba36aba867a4851f8968ac65b4d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 31 Aug 2016 11:50:03 -0400 Subject: net: dsa: add MDB support Add SWITCHDEV_OBJ_ID_PORT_MDB support to the DSA layer. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.txt | 23 +++++++++++++++ include/net/dsa.h | 16 +++++++++++ net/dsa/slave.c | 55 ++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) (limited to 'include') diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index a4e55c76d371..6d6c07cf1a9a 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -584,6 +584,29 @@ of DSA, would be the its port-based VLAN, used by the associated bridge device. function that the driver has to call for each MAC address known to be behind the given port. A switchdev object is used to carry the VID and FDB info. +- port_mdb_prepare: bridge layer function invoked when the bridge prepares the + installation of a multicast database entry. If the operation is not supported, + this function should return -EOPNOTSUPP to inform the bridge code to fallback + to a software implementation. No hardware setup must be done in this function. + See port_fdb_add for this and details. + +- port_mdb_add: bridge layer function invoked when the bridge wants to install + a multicast database entry, the switch hardware should be programmed with the + specified address in the specified VLAN ID in the forwarding database + associated with this VLAN ID. + +Note: VLAN ID 0 corresponds to the port private database, which, in the context +of DSA, would be the its port-based VLAN, used by the associated bridge device. + +- port_mdb_del: bridge layer function invoked when the bridge wants to remove a + multicast database entry, the switch hardware should be programmed to delete + the specified MAC address from the specified VLAN ID if it was mapped into + this port forwarding database. + +- port_mdb_dump: bridge layer function invoked with a switchdev callback + function that the driver has to call for each MAC address known to be behind + the given port. A switchdev object is used to carry the VID and MDB info. + TODO ==== diff --git a/include/net/dsa.h b/include/net/dsa.h index 2ebeba44a461..e3eb230b970d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -234,6 +234,7 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) struct switchdev_trans; struct switchdev_obj; struct switchdev_obj_port_fdb; +struct switchdev_obj_port_mdb; struct switchdev_obj_port_vlan; struct dsa_switch_ops { @@ -369,6 +370,21 @@ struct dsa_switch_ops { int (*port_fdb_dump)(struct dsa_switch *ds, int port, struct switchdev_obj_port_fdb *fdb, int (*cb)(struct switchdev_obj *obj)); + + /* + * Multicast database + */ + int (*port_mdb_prepare)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans); + void (*port_mdb_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans); + int (*port_mdb_del)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb); + int (*port_mdb_dump)(struct dsa_switch *ds, int port, + struct switchdev_obj_port_mdb *mdb, + int (*cb)(struct switchdev_obj *obj)); }; void register_switch_driver(struct dsa_switch_ops *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9f6c2a20f6ff..9ecbe787f102 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -290,6 +290,50 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, return -EOPNOTSUPP; } +static int dsa_slave_port_mdb_add(struct net_device *dev, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans); + } + + ds->ops->port_mdb_add(ds, p->port, mdb, trans); + + return 0; +} + +static int dsa_slave_port_mdb_del(struct net_device *dev, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->ops->port_mdb_del) + return ds->ops->port_mdb_del(ds, p->port, mdb); + + return -EOPNOTSUPP; +} + +static int dsa_slave_port_mdb_dump(struct net_device *dev, + struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->ops->port_mdb_dump) + return ds->ops->port_mdb_dump(ds, p->port, mdb, cb); + + return -EOPNOTSUPP; +} + static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -412,6 +456,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev, SWITCHDEV_OBJ_PORT_FDB(obj), trans); break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj), + trans); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_add(dev, SWITCHDEV_OBJ_PORT_VLAN(obj), @@ -435,6 +483,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev, err = dsa_slave_port_fdb_del(dev, SWITCHDEV_OBJ_PORT_FDB(obj)); break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_del(dev, SWITCHDEV_OBJ_PORT_VLAN(obj)); @@ -459,6 +510,10 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, SWITCHDEV_OBJ_PORT_FDB(obj), cb); break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj), + cb); + break; case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_port_vlan_dump(dev, SWITCHDEV_OBJ_PORT_VLAN(obj), -- cgit v1.2.3 From d001648ec7cf8b21ae9eec8b9ba4a18295adfb14 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Aug 2016 20:42:14 +0100 Subject: rxrpc: Don't expose skbs to in-kernel users [ver #2] Don't expose skbs to in-kernel users, such as the AFS filesystem, but instead provide a notification hook the indicates that a call needs attention and another that indicates that there's a new call to be collected. This makes the following possibilities more achievable: (1) Call refcounting can be made simpler if skbs don't hold refs to calls. (2) skbs referring to non-data events will be able to be freed much sooner rather than being queued for AFS to pick up as rxrpc_kernel_recv_data will be able to consult the call state. (3) We can shortcut the receive phase when a call is remotely aborted because we don't have to go through all the packets to get to the one cancelling the operation. (4) It makes it easier to do encryption/decryption directly between AFS's buffers and sk_buffs. (5) Encryption/decryption can more easily be done in the AFS's thread contexts - usually that of the userspace process that issued a syscall - rather than in one of rxrpc's background threads on a workqueue. (6) AFS will be able to wait synchronously on a call inside AF_RXRPC. To make this work, the following interface function has been added: int rxrpc_kernel_recv_data( struct socket *sock, struct rxrpc_call *call, void *buffer, size_t bufsize, size_t *_offset, bool want_more, u32 *_abort_code); This is the recvmsg equivalent. It allows the caller to find out about the state of a specific call and to transfer received data into a buffer piecemeal. afs_extract_data() and rxrpc_kernel_recv_data() now do all the extraction logic between them. They don't wait synchronously yet because the socket lock needs to be dealt with. Five interface functions have been removed: rxrpc_kernel_is_data_last() rxrpc_kernel_get_abort_code() rxrpc_kernel_get_error_number() rxrpc_kernel_free_skb() rxrpc_kernel_data_consumed() As a temporary hack, sk_buffs going to an in-kernel call are queued on the rxrpc_call struct (->knlrecv_queue) rather than being handed over to the in-kernel user. To process the queue internally, a temporary function, temp_deliver_data() has been added. This will be replaced with common code between the rxrpc_recvmsg() path and the kernel_rxrpc_recv_data() path in a future patch. Signed-off-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.txt | 72 +++--- fs/afs/cmservice.c | 142 ++++++------ fs/afs/fsclient.c | 148 ++++++------- fs/afs/internal.h | 33 +-- fs/afs/rxrpc.c | 439 +++++++++++++------------------------ fs/afs/vlclient.c | 7 +- include/net/af_rxrpc.h | 35 +-- net/rxrpc/af_rxrpc.c | 29 +-- net/rxrpc/ar-internal.h | 23 +- net/rxrpc/call_accept.c | 13 +- net/rxrpc/call_object.c | 5 +- net/rxrpc/conn_event.c | 1 - net/rxrpc/input.c | 10 +- net/rxrpc/output.c | 2 +- net/rxrpc/recvmsg.c | 191 +++++++++++++--- net/rxrpc/skbuff.c | 1 - 16 files changed, 565 insertions(+), 586 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index cfc8cb91452f..1b63bbc6b94f 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -748,6 +748,37 @@ The kernel interface functions are as follows: The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. + (*) Receive data from a call. + + int rxrpc_kernel_recv_data(struct socket *sock, + struct rxrpc_call *call, + void *buf, + size_t size, + size_t *_offset, + bool want_more, + u32 *_abort) + + This is used to receive data from either the reply part of a client call + or the request part of a service call. buf and size specify how much + data is desired and where to store it. *_offset is added on to buf and + subtracted from size internally; the amount copied into the buffer is + added to *_offset before returning. + + want_more should be true if further data will be required after this is + satisfied and false if this is the last item of the receive phase. + + There are three normal returns: 0 if the buffer was filled and want_more + was true; 1 if the buffer was filled, the last DATA packet has been + emptied and want_more was false; and -EAGAIN if the function needs to be + called again. + + If the last DATA packet is processed but the buffer contains less than + the amount requested, EBADMSG is returned. If want_more wasn't set, but + more data was available, EMSGSIZE is returned. + + If a remote ABORT is detected, the abort code received will be stored in + *_abort and ECONNABORTED will be returned. + (*) Abort a call. void rxrpc_kernel_abort_call(struct socket *sock, @@ -825,47 +856,6 @@ The kernel interface functions are as follows: Other errors may be returned if the call had been aborted (-ECONNABORTED) or had timed out (-ETIME). - (*) Record the delivery of a data message. - - void rxrpc_kernel_data_consumed(struct rxrpc_call *call, - struct sk_buff *skb); - - This is used to record a data message as having been consumed and to - update the ACK state for the call. The message must still be passed to - rxrpc_kernel_free_skb() for disposal by the caller. - - (*) Free a message. - - void rxrpc_kernel_free_skb(struct sk_buff *skb); - - This is used to free a non-DATA socket buffer intercepted from an AF_RXRPC - socket. - - (*) Determine if a data message is the last one on a call. - - bool rxrpc_kernel_is_data_last(struct sk_buff *skb); - - This is used to determine if a socket buffer holds the last data message - to be received for a call (true will be returned if it does, false - if not). - - The data message will be part of the reply on a client call and the - request on an incoming call. In the latter case there will be more - messages, but in the former case there will not. - - (*) Get the abort code from an abort message. - - u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb); - - This is used to extract the abort code from a remote abort message. - - (*) Get the error number from a local or network error message. - - int rxrpc_kernel_get_error_number(struct sk_buff *skb); - - This is used to extract the error number from a message indicating either - a local error occurred or a network error occurred. - (*) Allocate a null key for doing anonymous security. struct key *rxrpc_get_null_key(const char *keyname); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 77ee481059ac..2037e7a77a37 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -17,15 +17,12 @@ #include "internal.h" #include "afs_cm.h" -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); /* @@ -130,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call) * received. The step number here must match the final number in * afs_deliver_cb_callback(). */ - if (call->unmarshall == 6) { + if (call->unmarshall == 5) { ASSERT(call->server && call->count && call->request); afs_break_callbacks(call->server, call->count, call->request); } @@ -164,8 +161,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work) /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_callback *cb; @@ -174,7 +170,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -185,7 +181,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -202,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, true); if (ret < 0) return ret; @@ -229,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -239,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, return -EBADMSG; call->offset = 0; call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; case 4: _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, false); if (ret < 0) return ret; @@ -258,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->type = ntohl(*bp++); } - empty_cb_array: call->offset = 0; call->unmarshall++; - case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking * work, even if the final ACK isn't received. @@ -275,7 +263,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, * updated also. */ call->unmarshall++; - case 6: + case 5: break; } @@ -310,19 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -344,21 +330,61 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - /* There are some arguments that we ignore */ - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + break; + } /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -390,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -435,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -459,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -487,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); @@ -570,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 9312b92e54be..96f4d764d1a6 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; @@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, void *buffer; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, * client) */ case 1: _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the returned data length */ case 2: _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); + buffer = kmap(page); + ret = afs_extract_data(call, buffer, + call->count, true); + kunmap(buffer); if (ret < 0) return ret; } @@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); + ret = afs_extract_data(call, call->buffer, + (21 + 3 + 6) * 4, false); if (ret < 0) return ret; @@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page); + buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); + kunmap(buffer); } _leave(" = 0 [done]"); @@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server, /* * deliver reply data to an FS.GiveUpCallBacks */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + _enter(""); /* shouldn't be any reply data */ - return afs_data_complete(call, skb, last); + return afs_extract_data(call, NULL, 0, false); } /* @@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server, /* * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server, /* * deliver reply data to an FS.RemoveFile or FS.RemoveDir */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_remove(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server, /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server, /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server, /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server, /* * deliver reply data to an FS.StoreData */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, /* * deliver reply data to an FS.StoreStatus */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); + ret = afs_extract_data(call, call->buffer, + 12 * 4, true); if (ret < 0) return ret; @@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->unmarshall++; /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); + call->count = (4 - (call->count & 3)) & 3; case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, false); if (ret < 0) return ret; call->offset = 0; call->unmarshall++; - no_motd_padding: - case 11: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } @@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server, /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d97552de9c59..5497c8496055 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,7 @@ struct afs_mount_params { */ struct afs_wait_mode { /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); + rxrpc_notify_rx_t notify_rx; /* synchronous call waiter and call dispatched notification */ int (*wait)(struct afs_call *call); @@ -76,10 +75,8 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ struct afs_server *server; /* server affected by incoming CM call */ @@ -93,6 +90,7 @@ struct afs_call { void *reply4; /* reply buffer (fourth part) */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ + size_t offset; /* offset into received data store */ enum { /* call state */ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ @@ -100,21 +98,18 @@ struct afs_call { AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ AFS_CALL_REPLYING, /* replying to incoming call */ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ + AFS_CALL_COMPLETE, /* Completed or failed */ } state; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ + bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ __be32 operation_ID; /* operation ID for an incoming call */ @@ -129,8 +124,7 @@ struct afs_call_type { /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); + int (*deliver)(struct afs_call *call); /* map an abort code to an error number */ int (*abort_to_error)(u32 abort_code); @@ -612,27 +606,18 @@ extern struct socket *afs_socket; extern int afs_open_socket(void); extern void afs_close_socket(void); -extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, - bool last) +static inline int afs_transfer_reply(struct afs_call *call) { - if (skb->len > 0) - return -EBADMSG; - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - return 0; + return afs_extract_data(call, call->buffer, call->reply_max, false); } /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 7b0d18900f50..244896baf241 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -19,31 +19,31 @@ struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; -static void afs_wake_up_call_waiter(struct afs_call *); +static void afs_free_call(struct afs_call *); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct afs_call *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_new_call(struct sock *); +static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, + .notify_rx = afs_wake_up_call_waiter, .wait = afs_wait_for_call_to_complete, }; /* asynchronous call management */ const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, .wait = afs_dont_wait_for_call_to_complete, }; /* asynchronous incoming call management */ static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, }; /* asynchronous incoming call initial processing */ @@ -55,16 +55,8 @@ static const struct afs_call_type afs_RXCMxxxx = { static void afs_collect_incoming_call(struct work_struct *); -static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(call); -} - static int afs_wait_atomic_t(atomic_t *p) { schedule(); @@ -83,8 +75,6 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - ret = -ENOMEM; afs_async_calls = create_singlethread_workqueue("kafsd"); if (!afs_async_calls) @@ -110,12 +100,12 @@ int afs_open_socket(void) if (ret < 0) goto error_2; + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call); + ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); - afs_socket = socket; _leave(" = 0"); return 0; @@ -136,51 +126,19 @@ void afs_close_socket(void) { _enter(""); + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); - - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); _leave(""); } -/* - * Note that the data in a socket buffer is now consumed. - */ -void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) -{ - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - rxrpc_kernel_data_consumed(call->rxcall, skb); - } -} - -/* - * free a socket buffer - */ -static void afs_free_skb(struct sk_buff *skb) -{ - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } -} - /* * free a call */ @@ -191,7 +149,6 @@ static void afs_free_call(struct afs_call *call) ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); ASSERT(call->type->name != NULL); kfree(call->request); @@ -227,7 +184,7 @@ static void afs_end_call(struct afs_call *call) * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) + size_t request_size, size_t reply_max) { struct afs_call *call; @@ -241,7 +198,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, call->type = type; call->request_size = request_size; - call->reply_max = reply_size; + call->reply_max = reply_max; if (request_size) { call->request = kmalloc(request_size, GFP_NOFS); @@ -249,14 +206,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: @@ -354,7 +310,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; - struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -366,8 +321,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -380,7 +334,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); + (unsigned long) call, gfp, + wait_mode->notify_rx); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -423,150 +378,84 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, error_do_abort: rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); error_kill_call: afs_end_call(call); _leave(" = %d", ret); return ret; } -/* - * Handles intercepted messages that were arriving in the socket's Rx queue. - * - * Called from the AF_RXRPC call processor in waitqueue process context. For - * each call, it is guaranteed this will be called in order of packet to be - * delivered. - */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) -{ - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); - } - - _leave(""); -} - /* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; u32 abort_code; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case -EAGAIN: - if (last) { - _debug("short data"); - goto unmarshal_error; - } - break; - case 0: - ASSERT(last); - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - unmarshal_error: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(afs_socket, - call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK + ) { + if (call->state == AFS_CALL_AWAIT_ACK) { + size_t offset = 0; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + NULL, 0, &offset, false, + &call->abort_code); + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret == 1) { + call->state = AFS_CALL_COMPLETE; + goto done; } - break; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Loc ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) - afs_end_call(call); + ret = call->type->deliver(call); + switch (ret) { + case 0: + if (call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code); + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code); + goto do_abort; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code); + goto do_abort; + } } +done: + if (call->state == AFS_CALL_COMPLETE && call->incoming) + afs_end_call(call); +out: _leave(""); + return; + +do_abort: + call->error = ret; + call->state = AFS_CALL_COMPLETE; + goto done; } /* @@ -574,7 +463,6 @@ static void afs_deliver_to_call(struct afs_call *call) */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; int ret; DECLARE_WAITQUEUE(myself, current); @@ -586,14 +474,15 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) set_current_state(TASK_INTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) + if (call->state == AFS_CALL_COMPLETE) break; ret = -EINTR; if (signal_pending(current)) @@ -607,9 +496,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_CALL_DEAD); } _debug("call complete"); @@ -621,17 +509,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; queue_work(afs_async_calls, &call->async_work); } @@ -649,8 +544,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct afs_call *call) +static void afs_delete_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); afs_free_call(call); @@ -660,17 +557,19 @@ static void afs_delete_async_call(struct afs_call *call) /* * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time */ -static void afs_process_async_call(struct afs_call *call) +static void afs_process_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); + } - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { if (call->wait_mode->async_complete) call->wait_mode->async_complete(call->reply, call->error); @@ -681,45 +580,13 @@ static void afs_process_async_call(struct afs_call *call) /* we can't just delete the call because the work item may be * queued */ - call->async_workfn = afs_delete_async_call; + call->async_work.func = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } _leave(""); } -/* - * Empty a socket buffer into a flat reply buffer. - */ -int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) -{ - size_t len = skb->len; - - if (len > call->reply_max - call->reply_size) { - _leave(" = -EBADMSG [%zu > %u]", - len, call->reply_max - call->reply_size); - return -EBADMSG; - } - - if (len > 0) { - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, - len) < 0) - BUG(); - call->reply_size += len; - } - - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } - return 0; -} - /* * accept the backlog of incoming calls */ @@ -727,14 +594,10 @@ static void afs_collect_incoming_call(struct work_struct *work) { struct rxrpc_call *rxcall; struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - /* don't need the notification */ - afs_free_skb(skb); + _enter(""); + do { if (!call) { call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); if (!call) { @@ -742,12 +605,10 @@ static void afs_collect_incoming_call(struct work_struct *work) return; } - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); call->state = AFS_CALL_AWAIT_OP_ID; _debug("CALL %p{%s} [%d]", @@ -757,46 +618,47 @@ static void afs_collect_incoming_call(struct work_struct *work) } rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); + (unsigned long)call, + afs_wake_up_async_call); if (!IS_ERR(rxcall)) { call->rxcall = rxcall; + call->need_attention = true; + queue_work(afs_async_calls, &call->async_work); call = NULL; } - } + } while (!call); if (call) afs_free_call(call); } +/* + * Notification of an incoming call. + */ +static void afs_rx_new_call(struct sock *sk) +{ + queue_work(afs_wq, &afs_collect_incoming_call_work); +} + /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cm_op_id(struct afs_call *call) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + int ret; - _enter("{%u},{%zu},%d", call->offset, len, last); + _enter("{%zu}", call->offset); ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; - } + ret = afs_extract_data(call, &call->operation_ID, 4, true); + if (ret < 0) + return ret; call->state = AFS_CALL_AWAIT_REQUEST; + call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -805,7 +667,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, /* pass responsibility for the remainer of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); } /* @@ -881,25 +743,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, void *buf, size_t count, + bool want_more) { - size_t len = skb->len; + int ret; - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + _enter("{%s,%zu},,%zu,%d", + call->type->name, call->offset, count, want_more); - ASSERTCMP(call->offset, <, count); + ASSERTCMP(call->offset, <=, count); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + buf, count, &call->offset, + want_more, &call->abort_code); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; + if (ret == 1) { + switch (call->state) { + case AFS_CALL_AWAIT_REPLY: + call->state = AFS_CALL_COMPLETE; + break; + case AFS_CALL_AWAIT_REQUEST: + call->state = AFS_CALL_REPLYING; + break; + default: + break; + } + return 0; } - return 0; + + if (ret == -ECONNABORTED) + call->error = call->type->abort_to_error(call->abort_code); + else + call->error = ret; + call->state = AFS_CALL_COMPLETE; + return ret; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index f94d1abdc3eb..94bcd97d22b8 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code) /* * deliver reply data to a VL.GetEntryByXXX call */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) { struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; int loop, ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f8d8079dc058..b4b6a3664dda 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -12,7 +12,6 @@ #ifndef _NET_RXRPC_H #define _NET_RXRPC_H -#include #include struct key; @@ -20,38 +19,26 @@ struct sock; struct socket; struct rxrpc_call; -/* - * the mark applied to socket buffers that may be intercepted - */ -enum rxrpc_skb_mark { - RXRPC_SKB_MARK_DATA, /* data message */ - RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ - RXRPC_SKB_MARK_BUSY, /* server busy message */ - RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ - RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ - RXRPC_SKB_MARK_NET_ERROR, /* network error message */ - RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ - RXRPC_SKB_MARK_NEW_CALL, /* local error message */ -}; +typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, + unsigned long); +typedef void (*rxrpc_notify_new_call_t)(struct sock *); -typedef void (*rxrpc_interceptor_t)(struct sock *, unsigned long, - struct sk_buff *); -void rxrpc_kernel_intercept_rx_messages(struct socket *, rxrpc_interceptor_t); +void rxrpc_kernel_new_call_notification(struct socket *, + rxrpc_notify_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, struct sockaddr_rxrpc *, struct key *, unsigned long, - gfp_t); + gfp_t, + rxrpc_notify_rx_t); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t); -void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); +int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, + void *, size_t, size_t *, bool, u32 *); void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -bool rxrpc_kernel_is_data_last(struct sk_buff *); -u32 rxrpc_kernel_get_abort_code(struct sk_buff *); -int rxrpc_kernel_get_error_number(struct sk_buff *); -void rxrpc_kernel_free_skb(struct sk_buff *); -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long); +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, + rxrpc_notify_rx_t); int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index e07c91acd904..32d544995dda 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -231,6 +231,8 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @srx: The address of the peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use + * @gfp: The allocation constraints + * @notify_rx: Where to send notifications instead of socket queue * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and @@ -243,7 +245,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct sockaddr_rxrpc *srx, struct key *key, unsigned long user_call_ID, - gfp_t gfp) + gfp_t gfp, + rxrpc_notify_rx_t notify_rx) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -270,6 +273,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); + if (!IS_ERR(call)) + call->notify_rx = notify_rx; release_sock(&rx->sk); _leave(" = %p", call); @@ -289,31 +294,27 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); + rxrpc_purge_queue(&call->knlrecv_queue); rxrpc_put_call(call); } EXPORT_SYMBOL(rxrpc_kernel_end_call); /** - * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages + * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on - * @interceptor: The function to pass the messages to + * @notify_new_call: Function to be called when new calls appear * - * Allow a kernel service to intercept messages heading for the Rx queue on an - * RxRPC socket. They get passed to the specified function instead. - * @interceptor should free the socket buffers it is given. @interceptor is - * called with the socket receive queue spinlock held and softirqs disabled - - * this ensures that the messages will be delivered in the right order. + * Allow a kernel service to be given notifications about new calls. */ -void rxrpc_kernel_intercept_rx_messages(struct socket *sock, - rxrpc_interceptor_t interceptor) +void rxrpc_kernel_new_call_notification( + struct socket *sock, + rxrpc_notify_new_call_t notify_new_call) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - _enter(""); - rx->interceptor = interceptor; + rx->notify_new_call = notify_new_call; } - -EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages); +EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); /* * connect an RxRPC socket diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0c320b2b7b43..4e86d248dc5e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -39,6 +39,20 @@ struct rxrpc_crypt { struct rxrpc_connection; +/* + * Mark applied to socket buffers. + */ +enum rxrpc_skb_mark { + RXRPC_SKB_MARK_DATA, /* data message */ + RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ + RXRPC_SKB_MARK_BUSY, /* server busy message */ + RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ + RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ + RXRPC_SKB_MARK_NET_ERROR, /* network error message */ + RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ + RXRPC_SKB_MARK_NEW_CALL, /* local error message */ +}; + /* * sk_state for RxRPC sockets */ @@ -57,7 +71,7 @@ enum { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ + rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ struct rxrpc_local *local; /* local endpoint */ struct list_head listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ @@ -367,6 +381,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ + RXRPC_CALL_RX_NO_MORE, /* Don't indicate MSG_MORE from recvmsg() */ }; /* @@ -441,6 +456,7 @@ struct rxrpc_call { struct timer_list resend_timer; /* Tx resend timer */ struct work_struct destroyer; /* call destroyer */ struct work_struct processor; /* packet processor and ACK generator */ + rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ @@ -448,6 +464,7 @@ struct rxrpc_call { struct rb_node sock_node; /* node in socket call tree */ struct sk_buff_head rx_queue; /* received packets */ struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ + struct sk_buff_head knlrecv_queue; /* Queue for kernel_recv [TODO: replace this] */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ @@ -512,7 +529,8 @@ extern struct workqueue_struct *rxrpc_workqueue; * call_accept.c */ void rxrpc_accept_incoming_calls(struct rxrpc_local *); -struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, + rxrpc_notify_rx_t); int rxrpc_reject_call(struct rxrpc_sock *); /* @@ -874,6 +892,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *); /* * skbuff.c */ +void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); void rxrpc_new_skb(struct sk_buff *); void rxrpc_see_skb(struct sk_buff *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 03af88fe798b..68a439e30df1 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -286,7 +286,8 @@ security_mismatch: * - assign the user call ID to the call at the front of the queue */ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, - unsigned long user_call_ID) + unsigned long user_call_ID, + rxrpc_notify_rx_t notify_rx) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -340,6 +341,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ + call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); @@ -437,17 +439,20 @@ out: * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call * @sock: The socket on which the impending call is waiting * @user_call_ID: The tag to attach to the call + * @notify_rx: Where to send notifications instead of socket queue * * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. + * call is still valid. The caller should immediately trigger their own + * notification as there must be data waiting. */ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID) + unsigned long user_call_ID, + rxrpc_notify_rx_t notify_rx) { struct rxrpc_call *call; _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); + call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID, notify_rx); _leave(" = %p", call); return call; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 104ee8b1de06..516d8ea82f02 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -136,6 +136,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) INIT_LIST_HEAD(&call->accept_link); skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); + skb_queue_head_init(&call->knlrecv_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); @@ -552,8 +553,6 @@ void rxrpc_release_call(struct rxrpc_call *call) spin_lock_bh(&call->lock); } spin_unlock_bh(&call->lock); - - ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); } del_timer_sync(&call->resend_timer); @@ -682,6 +681,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); rxrpc_purge_queue(&call->rx_queue); + rxrpc_purge_queue(&call->knlrecv_queue); rxrpc_put_peer(call->peer); kmem_cache_free(rxrpc_call_jar, call); } @@ -737,6 +737,7 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_purge_queue(&call->rx_queue); ASSERT(skb_queue_empty(&call->rx_oos_queue)); + rxrpc_purge_queue(&call->knlrecv_queue); sock_put(&call->socket->sk); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index bc9b05938ff5..9db90f4f768d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -282,7 +282,6 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, case RXRPC_PACKET_TYPE_DATA: case RXRPC_PACKET_TYPE_ACK: rxrpc_conn_retransmit_call(conn, skb); - rxrpc_free_skb(skb); return 0; case RXRPC_PACKET_TYPE_ABORT: diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 86bea9ad6c3d..72f016cfaaf5 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -90,9 +90,15 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, } /* allow interception by a kernel service */ - if (rx->interceptor) { - rx->interceptor(sk, call->user_call_ID, skb); + if (skb->mark == RXRPC_SKB_MARK_NEW_CALL && + rx->notify_new_call) { spin_unlock_bh(&sk->sk_receive_queue.lock); + skb_queue_tail(&call->knlrecv_queue, skb); + rx->notify_new_call(&rx->sk); + } else if (call->notify_rx) { + spin_unlock_bh(&sk->sk_receive_queue.lock); + skb_queue_tail(&call->knlrecv_queue, skb); + call->notify_rx(&rx->sk, call, call->user_call_ID); } else { _net("post skb %p", skb); __skb_queue_tail(&sk->sk_receive_queue, skb); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index b1e708a12151..817ae801e769 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -190,7 +190,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (cmd == RXRPC_CMD_ACCEPT) { if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) return -EINVAL; - call = rxrpc_accept_call(rx, user_call_ID); + call = rxrpc_accept_call(rx, user_call_ID, NULL); if (IS_ERR(call)) return PTR_ERR(call); rxrpc_put_call(call); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c9b38c7fb448..0ab7b334bab1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -369,55 +369,178 @@ wait_error: } -/** - * rxrpc_kernel_is_data_last - Determine if data message is last one - * @skb: Message holding data +/* + * Deliver messages to a call. This keeps processing packets until the buffer + * is filled and we find either more DATA (returns 0) or the end of the DATA + * (returns 1). If more packets are required, it returns -EAGAIN. * - * Determine if data message is last one for the parent call. + * TODO: Note that this is hacked in at the moment and will be replaced. */ -bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +static int temp_deliver_data(struct socket *sock, struct rxrpc_call *call, + struct iov_iter *iter, size_t size, + size_t *_offset) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + size_t remain; + int ret, copy; + + _enter("%d", call->debug_id); + +next: + local_bh_disable(); + skb = skb_dequeue(&call->knlrecv_queue); + local_bh_enable(); + if (!skb) { + if (test_bit(RXRPC_CALL_RX_NO_MORE, &call->flags)) + return 1; + _leave(" = -EAGAIN [empty]"); + return -EAGAIN; + } - ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + sp = rxrpc_skb(skb); + _debug("dequeued %p %u/%zu", skb, sp->offset, size); - return sp->hdr.flags & RXRPC_LAST_PACKET; -} + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + remain = size - *_offset; + if (remain > 0) { + copy = skb->len - sp->offset; + if (copy > remain) + copy = remain; + ret = skb_copy_datagram_iter(skb, sp->offset, iter, + copy); + if (ret < 0) + goto requeue_and_leave; -EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + /* handle piecemeal consumption of data packets */ + sp->offset += copy; + *_offset += copy; + } -/** - * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message - * @skb: Message indicating an abort - * - * Get the abort code from an RxRPC abort message. - */ -u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + if (sp->offset < skb->len) + goto partially_used_skb; + + /* We consumed the whole packet */ + ASSERTCMP(sp->offset, ==, skb->len); + if (sp->hdr.flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RX_NO_MORE, &call->flags); + rxrpc_kernel_data_consumed(call, skb); + rxrpc_free_skb(skb); + goto next; - switch (skb->mark) { - case RXRPC_SKB_MARK_REMOTE_ABORT: - case RXRPC_SKB_MARK_LOCAL_ABORT: - return sp->call->abort_code; default: - BUG(); + rxrpc_free_skb(skb); + goto next; } -} -EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); +partially_used_skb: + ASSERTCMP(*_offset, ==, size); + ret = 0; +requeue_and_leave: + skb_queue_head(&call->knlrecv_queue, skb); + return ret; +} /** - * rxrpc_kernel_get_error - Get the error number from an RxRPC error message - * @skb: Message indicating an error + * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info + * @sock: The socket that the call exists on + * @call: The call to send data through + * @buf: The buffer to receive into + * @size: The size of the buffer, including data already read + * @_offset: The running offset into the buffer. + * @want_more: True if more data is expected to be read + * @_abort: Where the abort code is stored if -ECONNABORTED is returned + * + * Allow a kernel service to receive data and pick up information about the + * state of a call. Returns 0 if got what was asked for and there's more + * available, 1 if we got what was asked for and we're at the end of the data + * and -EAGAIN if we need more data. + * + * Note that we may return -EAGAIN to drain empty packets at the end of the + * data, even if we've already copied over the requested data. * - * Get the error number from an RxRPC error message. + * This function adds the amount it transfers to *_offset, so this should be + * precleared as appropriate. Note that the amount remaining in the buffer is + * taken to be size - *_offset. + * + * *_abort should also be initialised to 0. */ -int rxrpc_kernel_get_error_number(struct sk_buff *skb) +int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, + void *buf, size_t size, size_t *_offset, + bool want_more, u32 *_abort) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct iov_iter iter; + struct kvec iov; + int ret; - return sp->error; -} + _enter("{%d,%s},%zu,%d", + call->debug_id, rxrpc_call_states[call->state], size, want_more); + + ASSERTCMP(*_offset, <=, size); + ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); -EXPORT_SYMBOL(rxrpc_kernel_get_error_number); + iov.iov_base = buf + *_offset; + iov.iov_len = size - *_offset; + iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); + + lock_sock(sock->sk); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = temp_deliver_data(sock, call, &iter, size, _offset); + if (ret < 0) + goto out; + + /* We can only reach here with a partially full buffer if we + * have reached the end of the data. We must otherwise have a + * full buffer or have been given -EAGAIN. + */ + if (ret == 1) { + if (*_offset < size) + goto short_data; + if (!want_more) + goto read_phase_complete; + ret = 0; + goto out; + } + + if (!want_more) + goto excess_data; + goto out; + + case RXRPC_CALL_COMPLETE: + goto call_complete; + + default: + *_offset = 0; + ret = -EINPROGRESS; + goto out; + } + +read_phase_complete: + ret = 1; +out: + release_sock(sock->sk); + _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); + return ret; + +short_data: + ret = -EBADMSG; + goto out; +excess_data: + ret = -EMSGSIZE; + goto out; +call_complete: + *_abort = call->abort_code; + ret = call->error; + if (call->completion == RXRPC_CALL_SUCCEEDED) { + ret = 1; + if (size > 0) + ret = -ECONNRESET; + } + goto out; +} +EXPORT_SYMBOL(rxrpc_kernel_recv_data); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 20529205bb8c..9752f8b1fdd0 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -127,7 +127,6 @@ void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) call->rx_data_recv = sp->hdr.seq; rxrpc_hard_ACK_data(call, skb); } -EXPORT_SYMBOL(rxrpc_kernel_data_consumed); /* * Destroy a packet that has an RxRPC control buffer -- cgit v1.2.3 From 66fdd05e7a85564f86d9b220de946aa98e8bc048 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 31 Aug 2016 11:16:22 +0800 Subject: rps: flow_dissector: Add the const for the parameter of flow_keys_have_l4 Add the const for the parameter of flow_keys_have_l4 for the readability. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index f266b512c3bd..d9534927d93b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -183,7 +183,7 @@ struct flow_keys_digest { void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); -static inline bool flow_keys_have_l4(struct flow_keys *keys) +static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } -- cgit v1.2.3 From d297653dd6f07afbe7e6c702a4bcd7615680002e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 30 Aug 2016 21:56:45 -0700 Subject: rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu Signed-off-by: Wilson Kok Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 7 +- drivers/net/vxlan.c | 14 ++- include/linux/netdevice.h | 4 +- include/linux/rtnetlink.h | 2 +- include/net/switchdev.h | 4 +- net/bridge/br_fdb.c | 23 ++--- net/bridge/br_private.h | 2 +- net/core/rtnetlink.c | 105 ++++++++++++++--------- net/switchdev/switchdev.c | 10 +-- 9 files changed, 98 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 3ebef27e0964..3ae3968b0edf 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -432,18 +432,19 @@ static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int qlcnic_fdb_dump(struct sk_buff *skb, struct netlink_callback *ncb, struct net_device *netdev, - struct net_device *filter_dev, int idx) + struct net_device *filter_dev, int *idx) { struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err = 0; if (!adapter->fdb_mac_learn) return ndo_dflt_fdb_dump(skb, ncb, netdev, filter_dev, idx); if ((adapter->flags & QLCNIC_ESWITCH_ENABLED) || qlcnic_sriov_check(adapter)) - idx = ndo_dflt_fdb_dump(skb, ncb, netdev, filter_dev, idx); + err = ndo_dflt_fdb_dump(skb, ncb, netdev, filter_dev, idx); - return idx; + return err; } static void qlcnic_82xx_cancel_idc_work(struct qlcnic_adapter *adapter) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3f7e0d2dd21a..f605a3684a7f 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -860,20 +860,20 @@ out: /* Dump forwarding table */ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, - struct net_device *filter_dev, int idx) + struct net_device *filter_dev, int *idx) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; + int err = 0; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct vxlan_fdb *f; - int err; hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; list_for_each_entry_rcu(rd, &f->remotes, list) { - if (idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; err = vxlan_fdb_info(skb, vxlan, f, @@ -881,17 +881,15 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); - if (err < 0) { - cb->args[1] = err; + if (err < 0) goto out; - } skip: - ++idx; + *idx += 1; } } } out: - return idx; + return err; } /* Watch incoming packets to learn mapping between Ethernet address diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d122be9345c7..67bb978470dc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1031,7 +1031,7 @@ struct netdev_xdp { * Deletes the FDB entry from dev coresponding to addr. * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, - * int idx) + * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * @@ -1263,7 +1263,7 @@ struct net_device_ops { struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx); + int *idx); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 2daece8979f7..57e54847b0b9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -105,7 +105,7 @@ extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx); + int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 82f5e0462021..6279f2f179ec 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -222,7 +222,7 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], u16 vid); int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, - struct net_device *filter_dev, int idx); + struct net_device *filter_dev, int *idx); void switchdev_port_fwd_mark_set(struct net_device *dev, struct net_device *group_dev, bool joining); @@ -342,7 +342,7 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { return idx; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index cd620fab41b0..6b43c8c88f19 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -710,24 +710,27 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { struct net_bridge *br = netdev_priv(dev); + int err = 0; int i; if (!(dev->priv_flags & IFF_EBRIDGE)) goto out; - if (!filter_dev) - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (!filter_dev) { + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (err < 0) + goto out; + } for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - int err; - if (idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; if (filter_dev && @@ -750,17 +753,15 @@ int br_fdb_dump(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI); - if (err < 0) { - cb->args[1] = err; - break; - } + if (err < 0) + goto out; skip: - ++idx; + *idx += 1; } } out: - return idx; + return err; } /* Update (create or replace) forwarding database entry */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2379b2b865c9..3d36493f4487 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -508,7 +508,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, struct net_device *fdev, int idx); + struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 318fc5231b2b..1dfca1c3f8f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3068,7 +3068,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -3095,19 +3095,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { int err; netif_addr_lock_bh(dev); - err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); + nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); - cb->args[1] = err; - return idx; + return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -3120,9 +3119,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) const struct net_device_ops *cops = NULL; struct ifinfomsg *ifm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); + struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy) == 0) { @@ -3140,49 +3143,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - cb->args[1] = 0; - for_each_netdev(net, dev) { - if (brport_idx && (dev->ifindex != brport_idx)) - continue; + s_h = cb->args[0]; + s_idx = cb->args[1]; - if (!br_idx) { /* user did not specify a specific bridge */ - if (dev->priv_flags & IFF_BRIDGE_PORT) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { - } else { - if (dev != br_dev && - !(dev->priv_flags & IFF_BRIDGE_PORT)) + if (brport_idx && (dev->ifindex != brport_idx)) continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !(dev->priv_flags & IFF_EBRIDGE)) - continue; + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; - cops = ops; - } + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + cops = ops; + } - if (dev->priv_flags & IFF_BRIDGE_PORT) { - if (cops && cops->ndo_fdb_dump) - idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, - idx); - } - if (cb->args[1] == -EMSGSIZE) - break; + if (idx < s_idx) + goto cont; - if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, - idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); - if (cb->args[1] == -EMSGSIZE) - break; + if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, + br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + goto out; + } + } - cops = NULL; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, + dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, + &fidx); + if (err == -EMSGSIZE) + goto out; + + cops = NULL; + + /* reset fdb offset to 0 for rest of the interfaces */ + cb->args[2] = 0; + fidx = 0; +cont: + idx++; + } } - cb->args[0] = idx; +out: + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = fidx; + return skb->len; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 1031a0327fff..10b819308439 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1042,7 +1042,7 @@ static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj) struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[0]) + if (dump->idx < dump->cb->args[2]) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, @@ -1089,7 +1089,7 @@ nla_put_failure: */ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, - struct net_device *filter_dev, int idx) + struct net_device *filter_dev, int *idx) { struct switchdev_fdb_dump dump = { .fdb.obj.orig_dev = dev, @@ -1097,14 +1097,14 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .dev = dev, .skb = skb, .cb = cb, - .idx = idx, + .idx = *idx, }; int err; err = switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); - cb->args[1] = err; - return dump.idx; + *idx = dump.idx; + return err; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -- cgit v1.2.3 From b6cb5ac8331b6bcfe9ce38c7f7f58db6e1d6270a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 31 Aug 2016 15:36:52 +0200 Subject: net: bridge: add per-port multicast flood flag Add a per-port flag to control the unknown multicast flood, similar to the unknown unicast flood flag and break a few long lines in the netlink flag exports. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 3 +++ net/bridge/br_if.c | 2 +- net/bridge/br_netlink.c | 12 +++++++++--- net/bridge/br_sysfs_if.c | 1 + 6 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index dcb89e3515db..c6587c01d951 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -45,6 +45,7 @@ struct br_ip_list { #define BR_PROXYARP BIT(8) #define BR_LEARNING_SYNC BIT(9) #define BR_PROXYARP_WIFI BIT(10) +#define BR_MCAST_FLOOD BIT(11) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a1b5202c5f6b..9bf3aecfe05b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -318,6 +318,7 @@ enum { IFLA_BRPORT_FLUSH, IFLA_BRPORT_MULTICAST_ROUTER, IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 5de854ed3340..7cb41aee4c82 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -186,6 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + if (pkt_type == BR_PKT_MULTICAST && + !(p->flags & BR_MCAST_FLOOD)) + continue; /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 1da3221845f1..ed0dd3340084 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING | BR_FLOOD; + p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 190a5bc00f4a..e99037c6f7b7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -169,10 +169,15 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || - nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || - nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_PROTECT, + !!(p->flags & BR_ROOT_BLOCK)) || + nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, + !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || - nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, + !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, + !!(p->flags & BR_MCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -630,6 +635,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 1e04d4d44273..e657258e1f2c 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); +BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) -- cgit v1.2.3 From 04bed1434df256b07e78fa5deae848bba18a90f3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 31 Aug 2016 18:06:13 -0400 Subject: net: dsa: remove ds_to_priv Access the priv member of the dsa_switch structure directly, instead of having an unnecessary helper. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 42 +++++++++++------------ drivers/net/dsa/bcm_sf2.h | 2 +- drivers/net/dsa/mv88e6060.c | 4 +-- drivers/net/dsa/mv88e6xxx/chip.c | 72 ++++++++++++++++++++-------------------- include/net/dsa.h | 5 --- 5 files changed, 60 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 1299104a87d4..0afc2e5a04dc 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -477,7 +477,7 @@ static int b53_fast_age_vlan(struct b53_device *dev, u16 vid) static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; unsigned int i; u16 pvlan; @@ -495,7 +495,7 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) static int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; unsigned int cpu_port = dev->cpu_port; u16 pvlan; @@ -520,7 +520,7 @@ static int b53_enable_port(struct dsa_switch *ds, int port, static void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; u8 reg; /* Disable Tx/Rx for the port */ @@ -629,7 +629,7 @@ static int b53_switch_reset(struct b53_device *dev) static int b53_phy_read16(struct dsa_switch *ds, int addr, int reg) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; u16 value = 0; int ret; @@ -644,7 +644,7 @@ static int b53_phy_read16(struct dsa_switch *ds, int addr, int reg) static int b53_phy_write16(struct dsa_switch *ds, int addr, int reg, u16 val) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; if (priv->ops->phy_write16) return priv->ops->phy_write16(priv, addr, reg, val); @@ -714,7 +714,7 @@ static unsigned int b53_get_mib_size(struct b53_device *dev) static void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; const struct b53_mib_desc *mibs = b53_get_mib(dev); unsigned int mib_size = b53_get_mib_size(dev); unsigned int i; @@ -727,7 +727,7 @@ static void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data) static void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; const struct b53_mib_desc *mibs = b53_get_mib(dev); unsigned int mib_size = b53_get_mib_size(dev); const struct b53_mib_desc *s; @@ -759,7 +759,7 @@ static void b53_get_ethtool_stats(struct dsa_switch *ds, int port, static int b53_get_sset_count(struct dsa_switch *ds) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; return b53_get_mib_size(dev); } @@ -771,7 +771,7 @@ static int b53_set_addr(struct dsa_switch *ds, u8 *addr) static int b53_setup(struct dsa_switch *ds) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; unsigned int port; int ret; @@ -802,7 +802,7 @@ static int b53_setup(struct dsa_switch *ds) static void b53_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phydev) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; u8 rgmii_ctrl = 0, reg = 0, off; if (!phy_is_pseudo_fixed_link(phydev)) @@ -936,7 +936,7 @@ static int b53_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; if ((is5325(dev) || is5365(dev)) && vlan->vid_begin == 0) return -EOPNOTSUPP; @@ -953,7 +953,7 @@ static void b53_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; unsigned int cpu_port = dev->cpu_port; @@ -987,7 +987,7 @@ static void b53_vlan_add(struct dsa_switch *ds, int port, static int b53_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; unsigned int cpu_port = dev->cpu_port; struct b53_vlan *vl; @@ -1033,7 +1033,7 @@ static int b53_vlan_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_vlan *vlan, int (*cb)(struct switchdev_obj *obj)) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; u16 vid, vid_start = 0, pvid; struct b53_vlan *vl; int err = 0; @@ -1192,7 +1192,7 @@ static int b53_fdb_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_fdb *fdb, struct switchdev_trans *trans) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; /* 5325 and 5365 require some more massaging, but could * be supported eventually @@ -1207,7 +1207,7 @@ static void b53_fdb_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_fdb *fdb, struct switchdev_trans *trans) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; if (b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, true)) pr_err("%s: failed to add MAC address\n", __func__); @@ -1216,7 +1216,7 @@ static void b53_fdb_add(struct dsa_switch *ds, int port, static int b53_fdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_fdb *fdb) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; return b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, false); } @@ -1275,7 +1275,7 @@ static int b53_fdb_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_fdb *fdb, int (*cb)(struct switchdev_obj *obj)) { - struct b53_device *priv = ds_to_priv(ds); + struct b53_device *priv = ds->priv; struct net_device *dev = ds->ports[port].netdev; struct b53_arl_entry results[2]; unsigned int count = 0; @@ -1314,7 +1314,7 @@ static int b53_fdb_dump(struct dsa_switch *ds, int port, static int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; s8 cpu_port = ds->dst->cpu_port; u16 pvlan, reg; unsigned int i; @@ -1359,7 +1359,7 @@ static int b53_br_join(struct dsa_switch *ds, int port, static void b53_br_leave(struct dsa_switch *ds, int port) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; struct net_device *bridge = dev->ports[port].bridge_dev; struct b53_vlan *vl = &dev->vlans[0]; s8 cpu_port = ds->dst->cpu_port; @@ -1410,7 +1410,7 @@ static void b53_br_leave(struct dsa_switch *ds, int port) static void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; u8 hw_state, cur_hw_state; u8 reg; diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h index afe56686b3d7..46c4ea796574 100644 --- a/drivers/net/dsa/bcm_sf2.h +++ b/drivers/net/dsa/bcm_sf2.h @@ -101,7 +101,7 @@ struct bcm_sf2_priv { static inline struct bcm_sf2_priv *bcm_sf2_to_priv(struct dsa_switch *ds) { - struct b53_device *dev = ds_to_priv(ds); + struct b53_device *dev = ds->priv; return dev->priv; } diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 7ff9d373a9ee..7ce36dbd9b62 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -19,7 +19,7 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) { - struct mv88e6060_priv *priv = ds_to_priv(ds); + struct mv88e6060_priv *priv = ds->priv; return mdiobus_read_nested(priv->bus, priv->sw_addr + addr, reg); } @@ -37,7 +37,7 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) { - struct mv88e6060_priv *priv = ds_to_priv(ds); + struct mv88e6060_priv *priv = ds->priv; return mdiobus_write_nested(priv->bus, priv->sw_addr + addr, reg, val); } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index e2a953e7f5f5..d6b0f78e9598 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -586,7 +586,7 @@ static bool mv88e6xxx_has_fid_reg(struct mv88e6xxx_chip *chip) static void mv88e6xxx_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phydev) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u32 reg; int ret; @@ -832,7 +832,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip, static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port, uint8_t *data) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_hw_stat *stat; int i, j; @@ -848,7 +848,7 @@ static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port, static int mv88e6xxx_get_sset_count(struct dsa_switch *ds) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_hw_stat *stat; int i, j; @@ -863,7 +863,7 @@ static int mv88e6xxx_get_sset_count(struct dsa_switch *ds) static void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_hw_stat *stat; int ret; int i, j; @@ -894,7 +894,7 @@ static int mv88e6xxx_get_regs_len(struct dsa_switch *ds, int port) static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *_p) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 *p = _p; int i; @@ -924,7 +924,7 @@ static int _mv88e6xxx_atu_wait(struct mv88e6xxx_chip *chip) static int mv88e6xxx_get_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 reg; int err; @@ -954,7 +954,7 @@ out: static int mv88e6xxx_set_eee(struct dsa_switch *ds, int port, struct phy_device *phydev, struct ethtool_eee *e) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 reg; int err; @@ -1185,7 +1185,7 @@ static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int port) static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port, u8 state) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int stp_state; int err; @@ -1434,7 +1434,7 @@ static int mv88e6xxx_port_vlan_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_vlan *vlan, int (*cb)(struct switchdev_obj *obj)) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_vtu_stu_entry next; u16 pvid; int err; @@ -1803,7 +1803,7 @@ static int _mv88e6xxx_vtu_get(struct mv88e6xxx_chip *chip, u16 vid, static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, u16 vid_begin, u16 vid_end) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_vtu_stu_entry vlan; int i, err; @@ -1864,7 +1864,7 @@ static const char * const mv88e6xxx_port_8021q_mode_names[] = { static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 old, new = vlan_filtering ? PORT_CONTROL_2_8021Q_SECURE : PORT_CONTROL_2_8021Q_DISABLED; int ret; @@ -1906,7 +1906,7 @@ mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_VTU)) @@ -1947,7 +1947,7 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; u16 vid; @@ -2009,7 +2009,7 @@ static int _mv88e6xxx_port_vlan_del(struct mv88e6xxx_chip *chip, static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 pvid, vid; int err = 0; @@ -2134,7 +2134,7 @@ static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_fdb *fdb, struct switchdev_trans *trans) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; mutex_lock(&chip->reg_lock); if (mv88e6xxx_port_db_load_purge(chip, port, fdb->addr, fdb->vid, @@ -2146,7 +2146,7 @@ static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_fdb *fdb) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -2306,7 +2306,7 @@ static int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_fdb *fdb, int (*cb)(struct switchdev_obj *obj)) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -2319,7 +2319,7 @@ static int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, int port, static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, struct net_device *bridge) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int i, err = 0; mutex_lock(&chip->reg_lock); @@ -2342,7 +2342,7 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; struct net_device *bridge = chip->ports[port].bridge_dev; int i; @@ -2763,7 +2763,7 @@ static int mv88e6xxx_g1_set_age_time(struct mv88e6xxx_chip *chip, static int mv88e6xxx_set_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -3204,7 +3204,7 @@ static int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip) static int mv88e6xxx_setup(struct dsa_switch *ds) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; int i; @@ -3244,7 +3244,7 @@ unlock: static int mv88e6xxx_set_addr(struct dsa_switch *ds, u8 *addr) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -3352,7 +3352,7 @@ static void mv88e6xxx_mdio_unregister(struct mv88e6xxx_chip *chip) static int mv88e61xx_get_temp(struct dsa_switch *ds, int *temp) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; u16 val; int ret; @@ -3395,7 +3395,7 @@ error: static int mv88e63xx_get_temp(struct dsa_switch *ds, int *temp) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int phy = mv88e6xxx_6320_family(chip) ? 3 : 0; u16 val; int ret; @@ -3415,7 +3415,7 @@ static int mv88e63xx_get_temp(struct dsa_switch *ds, int *temp) static int mv88e6xxx_get_temp(struct dsa_switch *ds, int *temp) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_TEMP)) return -EOPNOTSUPP; @@ -3428,7 +3428,7 @@ static int mv88e6xxx_get_temp(struct dsa_switch *ds, int *temp) static int mv88e6xxx_get_temp_limit(struct dsa_switch *ds, int *temp) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int phy = mv88e6xxx_6320_family(chip) ? 3 : 0; u16 val; int ret; @@ -3451,7 +3451,7 @@ static int mv88e6xxx_get_temp_limit(struct dsa_switch *ds, int *temp) static int mv88e6xxx_set_temp_limit(struct dsa_switch *ds, int temp) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int phy = mv88e6xxx_6320_family(chip) ? 3 : 0; u16 val; int err; @@ -3474,7 +3474,7 @@ unlock: static int mv88e6xxx_get_temp_alarm(struct dsa_switch *ds, bool *alarm) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int phy = mv88e6xxx_6320_family(chip) ? 3 : 0; u16 val; int ret; @@ -3498,7 +3498,7 @@ static int mv88e6xxx_get_temp_alarm(struct dsa_switch *ds, bool *alarm) static int mv88e6xxx_get_eeprom_len(struct dsa_switch *ds) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; return chip->eeprom_len; } @@ -3556,7 +3556,7 @@ static int mv88e6xxx_get_eeprom16(struct mv88e6xxx_chip *chip, static int mv88e6xxx_get_eeprom(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -3645,7 +3645,7 @@ static int mv88e6xxx_set_eeprom16(struct mv88e6xxx_chip *chip, static int mv88e6xxx_set_eeprom(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; if (eeprom->magic != 0xc3ec4951) @@ -3953,7 +3953,7 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip, static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA)) return DSA_TAG_PROTO_EDSA; @@ -4018,7 +4018,7 @@ static void mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; mutex_lock(&chip->reg_lock); if (mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid, @@ -4030,7 +4030,7 @@ static void mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port, static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -4045,7 +4045,7 @@ static int mv88e6xxx_port_mdb_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_mdb *mdb, int (*cb)(struct switchdev_obj *obj)) { - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); @@ -4173,7 +4173,7 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) static void mv88e6xxx_remove(struct mdio_device *mdiodev) { struct dsa_switch *ds = dev_get_drvdata(&mdiodev->dev); - struct mv88e6xxx_chip *chip = ds_to_priv(ds); + struct mv88e6xxx_chip *chip = ds->priv; mv88e6xxx_phy_destroy(chip); mv88e6xxx_unregister_switch(chip); diff --git a/include/net/dsa.h b/include/net/dsa.h index e3eb230b970d..9d97c5214341 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -391,11 +391,6 @@ void register_switch_driver(struct dsa_switch_ops *type); void unregister_switch_driver(struct dsa_switch_ops *type); struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); -static inline void *ds_to_priv(struct dsa_switch *ds) -{ - return ds->priv; -} - static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { return dst->rcv != NULL; -- cgit v1.2.3 From 0515e5999a466dfe6e1924f460da599bb6821487 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:22 -0700 Subject: bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type Introduce BPF_PROG_TYPE_PERF_EVENT programs that can be attached to HW and SW perf events (PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE correspondingly in uapi/linux/perf_event.h) The program visible context meta structure is struct bpf_perf_event_data { struct pt_regs regs; __u64 sample_period; }; which is accessible directly from the program: int bpf_prog(struct bpf_perf_event_data *ctx) { ... ctx->sample_period ... ... ctx->regs.ip ... } The bpf verifier rewrites the accesses into kernel internal struct bpf_perf_event_data_kern which allows changing struct perf_sample_data without affecting bpf programs. New fields can be added to the end of struct bpf_perf_event_data in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/perf_event.h | 5 +++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/bpf_perf_event.h | 18 +++++++++++ kernel/trace/bpf_trace.c | 61 +++++++++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+) create mode 100644 include/uapi/linux/bpf_perf_event.h (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..97bfe62f30d7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -788,6 +788,11 @@ struct perf_output_handle { int page; }; +struct bpf_perf_event_data_kern { + struct pt_regs *regs; + struct perf_sample_data *data; +}; + #ifdef CONFIG_CGROUP_PERF /* diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 185f8ea2702f..d0352a971ebd 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -71,6 +71,7 @@ header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h header-y += bpf_common.h +header-y += bpf_perf_event.h header-y += bpf.h header-y += bpqether.h header-y += bsg.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4c5a1baa993..f896dfac4ac0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h new file mode 100644 index 000000000000..067427259820 --- /dev/null +++ b/include/uapi/linux/bpf_perf_event.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ +#define _UAPI__LINUX_BPF_PERF_EVENT_H__ + +#include +#include + +struct bpf_perf_event_data { + struct pt_regs regs; + __u64 sample_period; +}; + +#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad35213b8405..d3869b03d9fe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), + dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); -- cgit v1.2.3 From aa6a5f3cb2b2edc5b9aab0b4fdfdfa9c3b5096a8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:24 -0700 Subject: perf, bpf: add perf events core support for BPF_PROG_TYPE_PERF_EVENT programs Allow attaching BPF_PROG_TYPE_PERF_EVENT programs to sw and hw perf events via overflow_handler mechanism. When program is attached the overflow_handlers become stacked. The program acts as a filter. Returning zero from the program means that the normal perf_event_output handler will not be called and sampling event won't be stored in the ring buffer. The overflow_handler_context==NULL is an additional safety check to make sure programs are not attached to hw breakpoints and watchdog in case other checks (that prevent that now anyway) get accidentally relaxed in the future. The program refcnt is incremented in case perf_events are inhereted when target task is forked. Similar to kprobe and tracepoint programs there is no ioctl to detach the program or swap already attached program. The user space expected to close(perf_event_fd) like it does right now for kprobe+bpf. That restriction simplifies the code quite a bit. The invocation of overflow_handler in __perf_event_overflow() is now done via READ_ONCE, since that pointer can be replaced when the program is attached while perf_event itself could have been active already. There is no need to do similar treatment for event->prog, since it's assigned only once before it's accessed. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +++ include/linux/perf_event.h | 4 +++ kernel/events/core.c | 89 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 96 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11134238417d..9a904f63f8c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) static inline void bpf_prog_put(struct bpf_prog *prog) { } +static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 97bfe62f30d7..ccb73a58113d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -679,6 +679,10 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + perf_overflow_handler_t orig_overflow_handler; + struct bpf_prog *prog; +#endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..85bf4c37911f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { -- cgit v1.2.3 From dd19bde36739702bbd9a832b5d4995bc0fa8d6d7 Mon Sep 17 00:00:00 2001 From: "Rosen, Rami" Date: Fri, 2 Sep 2016 14:11:57 +0300 Subject: switchdev: Fix return value of switchdev_port_fdb_dump(). This patch fixes the retun value of switchdev_port_fdb_dump() when CONFIG_NET_SWITCHDEV is not set. This avoids getting "warning: return makes integer from pointer without a cast [-Wint-conversion]" when building when CONFIG_NET_SWITCHDEV is not set under several compiler versions. This warning is due to commit d297653dd6f07afbe7e6c702a4bcd7615680002e ("rtnetlink: fdb dump: optimize by saving last interface markers"). Signed-off-by: Rami Rosen Acked-by: Roopa Prabhu Reported-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 6279f2f179ec..729fe1534160 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -344,7 +344,7 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, struct net_device *filter_dev, int *idx) { - return idx; + return *idx; } static inline bool switchdev_port_same_parent_id(struct net_device *a, -- cgit v1.2.3 From 3f37ec79dd21fbdbbab8143a48a87272b22fef22 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Mon, 25 Jul 2016 20:33:56 +0200 Subject: bcma: support BCM53573 series of wireless SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM53573 seems to be the first series of Northstar family with wireless on the chip. The base models are BCM53573-s (A0, A1) and there is also BCM47189B0 which seems to be some small modification. The only problem with these chipsets seems to be watchdog. It's totally unavailable on 53573A0 / 53573A1 and preferable PMU watchdog is broken on 53573B0 / 53573B1. Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: Kalle Valo --- drivers/bcma/driver_chipcommon.c | 32 +++++++++++++++++++++++++++++--- include/linux/bcma/bcma.h | 3 +++ 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c index 921ce1834673..b4f6520e74f0 100644 --- a/drivers/bcma/driver_chipcommon.c +++ b/drivers/bcma/driver_chipcommon.c @@ -36,12 +36,31 @@ u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc) } EXPORT_SYMBOL_GPL(bcma_chipco_get_alp_clock); +static bool bcma_core_cc_has_pmu_watchdog(struct bcma_drv_cc *cc) +{ + struct bcma_bus *bus = cc->core->bus; + + if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bus->chipinfo.id == BCMA_CHIP_ID_BCM53573) { + WARN(bus->chipinfo.rev <= 1, "No watchdog available\n"); + /* 53573B0 and 53573B1 have bugged PMU watchdog. It can + * be enabled but timer can't be bumped. Use CC one + * instead. + */ + return false; + } + return true; + } else { + return false; + } +} + static u32 bcma_chipco_watchdog_get_max_timer(struct bcma_drv_cc *cc) { struct bcma_bus *bus = cc->core->bus; u32 nb; - if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bcma_core_cc_has_pmu_watchdog(cc)) { if (bus->chipinfo.id == BCMA_CHIP_ID_BCM4706) nb = 32; else if (cc->core->id.rev < 26) @@ -95,9 +114,16 @@ static int bcma_chipco_watchdog_ticks_per_ms(struct bcma_drv_cc *cc) int bcma_chipco_watchdog_register(struct bcma_drv_cc *cc) { + struct bcma_bus *bus = cc->core->bus; struct bcm47xx_wdt wdt = {}; struct platform_device *pdev; + if (bus->chipinfo.id == BCMA_CHIP_ID_BCM53573 && + bus->chipinfo.rev <= 1) { + pr_debug("No watchdog on 53573A0 / 53573A1\n"); + return 0; + } + wdt.driver_data = cc; wdt.timer_set = bcma_chipco_watchdog_timer_set_wdt; wdt.timer_set_ms = bcma_chipco_watchdog_timer_set_ms_wdt; @@ -105,7 +131,7 @@ int bcma_chipco_watchdog_register(struct bcma_drv_cc *cc) bcma_chipco_watchdog_get_max_timer(cc) / cc->ticks_per_ms; pdev = platform_device_register_data(NULL, "bcm47xx-wdt", - cc->core->bus->num, &wdt, + bus->num, &wdt, sizeof(wdt)); if (IS_ERR(pdev)) return PTR_ERR(pdev); @@ -217,7 +243,7 @@ u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks) u32 maxt; maxt = bcma_chipco_watchdog_get_max_timer(cc); - if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bcma_core_cc_has_pmu_watchdog(cc)) { if (ticks == 1) ticks = 2; else if (ticks > maxt) diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3db25df396cb..8eeedb2db924 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -205,6 +205,9 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4709 0 #define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 +#define BCMA_CHIP_ID_BCM53573 53573 +#define BCMA_PKG_ID_BCM53573 0 +#define BCMA_PKG_ID_BCM47189 1 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ -- cgit v1.2.3 From 5f2d9c44389e7cd9fe192570f6f20199bc861eb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Sep 2016 22:39:45 +0100 Subject: rxrpc: Randomise epoch and starting client conn ID values Create a random epoch value rather than a time-based one on startup and set the top bit to indicate that this is the case. Also create a random starting client connection ID value. This will be incremented from here as new client connections are created. Signed-off-by: David Howells --- include/rxrpc/packet.h | 1 + net/rxrpc/af_rxrpc.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index b2017440b765..3c6128e1fdbe 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -24,6 +24,7 @@ typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ */ struct rxrpc_wire_header { __be32 epoch; /* client boot timestamp */ +#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ __be32 cid; /* connection and channel ID */ #define RXRPC_MAXCALLS 4 /* max active calls per conn */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 32d544995dda..b66a9e6f8d04 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -700,7 +701,13 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); - rxrpc_epoch = get_seconds(); + get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); + rxrpc_epoch |= RXRPC_RANDOM_EPOCH; + get_random_bytes(&rxrpc_client_conn_ids.cur, + sizeof(rxrpc_client_conn_ids.cur)); + rxrpc_client_conn_ids.cur &= 0x3fffffff; + if (rxrpc_client_conn_ids.cur == 0) + rxrpc_client_conn_ids.cur = 1; ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( -- cgit v1.2.3 From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:11 +0800 Subject: netfilter: gre: Use consistent GRE_* macros instead of ones defined by netfilter. There are already some GRE_* macros in kernel, so it is unnecessary to define these macros. And remove some useless macros Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 22 ++-------------------- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/netfilter/nf_nat_proto_gre.c | 4 ++-- net/netfilter/nf_conntrack_proto_gre.c | 4 ++-- 4 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index df78dc2b5524..0189747f2691 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -1,29 +1,11 @@ #ifndef _CONNTRACK_PROTO_GRE_H #define _CONNTRACK_PROTO_GRE_H #include +#include +#include /* GRE PROTOCOL HEADER */ -/* GRE Version field */ -#define GRE_VERSION_1701 0x0 -#define GRE_VERSION_PPTP 0x1 - -/* GRE Protocol field */ -#define GRE_PROTOCOL_PPTP 0x880B - -/* GRE Flags */ -#define GRE_FLAG_C 0x80 -#define GRE_FLAG_R 0x40 -#define GRE_FLAG_K 0x20 -#define GRE_FLAG_S 0x10 -#define GRE_FLAG_A 0x80 - -#define GRE_IS_C(f) ((f)&GRE_FLAG_C) -#define GRE_IS_R(f) ((f)&GRE_FLAG_R) -#define GRE_IS_K(f) ((f)&GRE_FLAG_K) -#define GRE_IS_S(f) ((f)&GRE_FLAG_S) -#define GRE_IS_A(f) ((f)&GRE_FLAG_A) - /* GRE is a mess: Four different standards */ struct gre_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9865c8caedde..fb7337d6b985 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -39,6 +39,7 @@ #define GRE_IS_REC(f) ((f) & GRE_REC) #define GRE_IS_ACK(f) ((f) & GRE_ACK) +#define GRE_VERSION_0 __cpu_to_be16(0x0000) #define GRE_VERSION_1 __cpu_to_be16(0x0001) #define GRE_PROTO_PPP __cpu_to_be16(0x880b) #define GRE_PPTP_KEY_MASK __cpu_to_be32(0xffff) diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 9414923f1e15..93198d71dbb6 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -104,11 +104,11 @@ gre_manip_pkt(struct sk_buff *skb, if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->version) { - case GRE_VERSION_1701: + case ntohs(GRE_VERSION_0): /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; - case GRE_VERSION_PPTP: + case ntohs(GRE_VERSION_1): pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index a96451a7af20..deb239a014e4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -200,7 +200,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { + if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -212,7 +212,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, if (!pgrehdr) return true; - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { + if (grehdr->protocol != GRE_PROTO_PPP) { pr_debug("GRE_VERSION_PPTP but unknown proto\n"); return false; } -- cgit v1.2.3 From c579a9e7d58f66030a144c7a33cc9bdf827a4b6d Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:47 +0800 Subject: netfilter: gre: Use consistent GRE and PTTP header structure instead of the ones defined by netfilter There are two existing strutures which defines the GRE and PPTP header. So use these two structures instead of the ones defined by netfilter to keep consitent with other codes. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 42 ------------------------ net/ipv4/netfilter/nf_nat_proto_gre.c | 13 ++++---- net/netfilter/nf_conntrack_proto_gre.c | 12 +++---- 3 files changed, 13 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index 0189747f2691..dee0acd0dd31 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -4,48 +4,6 @@ #include #include -/* GRE PROTOCOL HEADER */ - -/* GRE is a mess: Four different standards */ -struct gre_hdr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 rec:3, - srr:1, - seq:1, - key:1, - routing:1, - csum:1, - version:3, - reserved:4, - ack:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u16 csum:1, - routing:1, - key:1, - seq:1, - srr:1, - rec:3, - ack:1, - reserved:4, - version:3; -#else -#error "Adjust your defines" -#endif - __be16 protocol; -}; - -/* modified GRE header for PPTP */ -struct gre_hdr_pptp { - __u8 flags; /* bitfield */ - __u8 version; /* should be GRE_VERSION_PPTP */ - __be16 protocol; /* should be GRE_PROTOCOL_PPTP */ - __be16 payload_len; /* size of ppp payload, not inc. gre header */ - __be16 call_id; /* peer's call_id for this session */ - __be32 seq; /* sequence number. Present if S==1 */ - __be32 ack; /* seq number of highest packet received by */ - /* sender in this session */ -}; - struct nf_ct_gre { unsigned int stream_timeout; unsigned int timeout; diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 93198d71dbb6..edf05002d674 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - const struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; + const struct gre_base_hdr *greh; + struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ @@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb, return false; greh = (void *)skb->data + hdroff; - pgreh = (struct gre_hdr_pptp *)greh; + pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; - switch (greh->version) { - case ntohs(GRE_VERSION_0): + + switch (greh->flags & GRE_VERSION) { + case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; - case ntohs(GRE_VERSION_1): + case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index deb239a014e4..9a715f88b2f1 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct gre_hdr_pptp *pgrehdr; - struct gre_hdr_pptp _pgrehdr; + const struct pptp_gre_header *pgrehdr; + struct pptp_gre_header _pgrehdr; __be16 srckey; - const struct gre_hdr *grehdr; - struct gre_hdr _grehdr; + const struct gre_base_hdr *grehdr; + struct gre_base_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) { + if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -213,7 +213,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; if (grehdr->protocol != GRE_PROTO_PPP) { - pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol)); return false; } -- cgit v1.2.3 From 0d9932b2875f568d679f2af33ce610da3903ac11 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Fri, 2 Sep 2016 15:05:57 +0200 Subject: netfilter: nft_numgen: rename until attribute by modulus The _until_ attribute is renamed to _modulus_ as the behaviour is similar to other expresions with number limits (ex. nft_hash). Renaming is possible because there isn't a kernel release yet with these changes. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++-- net/netfilter/nft_numgen.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 28ce01d79707..24161e25576d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1126,13 +1126,13 @@ enum nft_trace_types { * enum nft_ng_attributes - nf_tables number generator expression netlink attributes * * @NFTA_NG_DREG: destination register (NLA_U32) - * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32) + * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, NFTA_NG_DREG, - NFTA_NG_UNTIL, + NFTA_NG_MODULUS, NFTA_NG_TYPE, __NFTA_NG_MAX }; diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 294745ecb0fc..f51a3ede3932 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -21,7 +21,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { enum nft_registers dreg:8; - u32 until; + u32 modulus; atomic_t counter; }; @@ -34,7 +34,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, do { oval = atomic_read(&priv->counter); - nval = (oval + 1 < priv->until) ? oval + 1 : 0; + nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); memcpy(®s->data[priv->dreg], &priv->counter, sizeof(u32)); @@ -42,7 +42,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, - [NFTA_NG_UNTIL] = { .type = NLA_U32 }, + [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, }; @@ -52,8 +52,8 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, { struct nft_ng_inc *priv = nft_expr_priv(expr); - priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); - if (priv->until == 0) + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) return -ERANGE; priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); @@ -64,11 +64,11 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, - u32 until, enum nft_ng_types type) + u32 modulus, enum nft_ng_types type) { if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until))) + if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) goto nla_put_failure; @@ -83,12 +83,12 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL); } struct nft_ng_random { enum nft_registers dreg:8; - u32 until; + u32 modulus; }; static void nft_ng_random_eval(const struct nft_expr *expr, @@ -99,7 +99,7 @@ static void nft_ng_random_eval(const struct nft_expr *expr, struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state), - priv->until); + priv->modulus); } static int nft_ng_random_init(const struct nft_ctx *ctx, @@ -108,8 +108,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, { struct nft_ng_random *priv = nft_expr_priv(expr); - priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); - if (priv->until == 0) + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) return -ERANGE; prandom_init_once(&nft_numgen_prandom_state); @@ -124,7 +124,7 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM); } static struct nft_expr_type nft_ng_type; @@ -149,8 +149,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 type; - if (!tb[NFTA_NG_DREG] || - !tb[NFTA_NG_UNTIL] || + if (!tb[NFTA_NG_DREG] || + !tb[NFTA_NG_MODULUS] || !tb[NFTA_NG_TYPE]) return ERR_PTR(-EINVAL); -- cgit v1.2.3 From fff72429c2e83bdbe32dc7f1ad6398dfe50750c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:34:21 +0100 Subject: rxrpc: Improve the call tracking tracepoint Improve the call tracking tracepoint by showing more differentiation between some of the put and get events, including: (1) Getting and putting refs for the socket call user ID tree. (2) Getting and putting refs for queueing and failing to queue the call processor work item. Note that these aren't necessarily used in this patch, but will be taken advantage of in future patches. An enum is added for the event subtype numbers rather than coding them directly as decimal numbers and a table of 3-letter strings is provided rather than a sequence of ?: operators. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 11 +++------- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 22 ++++++++++++++++++-- net/rxrpc/call_accept.c | 10 ++++----- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 48 ++++++++++++++++++++++++++++---------------- net/rxrpc/input.c | 6 +++--- net/rxrpc/recvmsg.c | 23 +++++++++++---------- net/rxrpc/sendmsg.c | 4 ++-- net/rxrpc/skbuff.c | 2 +- 10 files changed, 79 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cbe574ea674b..30164896f1f6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,7 +17,8 @@ #include TRACE_EVENT(rxrpc_call, - TP_PROTO(struct rxrpc_call *call, int op, int usage, int nskb, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, + int usage, int nskb, const void *where, const void *aux), TP_ARGS(call, op, usage, nskb, where, aux), @@ -42,13 +43,7 @@ TRACE_EVENT(rxrpc_call, TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", __entry->call, - (__entry->op == 0 ? "NWc" : - __entry->op == 1 ? "NWs" : - __entry->op == 2 ? "SEE" : - __entry->op == 3 ? "GET" : - __entry->op == 4 ? "Gsb" : - __entry->op == 5 ? "PUT" : - "Psb"), + rxrpc_call_traces[__entry->op], __entry->usage, __entry->nskb, __entry->where, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index b66a9e6f8d04..8356cd003d51 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -296,7 +296,7 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); rxrpc_purge_queue(&call->knlrecv_queue); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } EXPORT_SYMBOL(rxrpc_kernel_end_call); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ad702f9f8d1f..913255a53564 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -508,6 +508,24 @@ struct rxrpc_call { unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; }; +enum rxrpc_call_trace { + rxrpc_call_new_client, + rxrpc_call_new_service, + rxrpc_call_queued, + rxrpc_call_queued_ref, + rxrpc_call_seen, + rxrpc_call_got, + rxrpc_call_got_skb, + rxrpc_call_got_userid, + rxrpc_call_put, + rxrpc_call_put_skb, + rxrpc_call_put_userid, + rxrpc_call_put_noqueue, + rxrpc_call__nr_trace +}; + +extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; + #include /* @@ -555,8 +573,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void rxrpc_see_call(struct rxrpc_call *); -void rxrpc_get_call(struct rxrpc_call *); -void rxrpc_put_call(struct rxrpc_call *); +void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); +void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void __exit rxrpc_destroy_all_calls(void); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 68a439e30df1..487ae7aa86db 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -115,7 +115,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, write_lock(&rx->call_lock); if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); spin_lock(&call->conn->state_lock); if (sp->hdr.securityIndex > 0 && @@ -155,7 +155,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("done"); read_unlock_bh(&local->services_lock); rxrpc_free_skb(notification); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = 0"); return 0; @@ -166,11 +166,11 @@ invalid_service: read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); ret = -ECONNREFUSED; error: rxrpc_free_skb(notification); @@ -341,6 +341,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ + rxrpc_get_call(call, rxrpc_call_got_userid); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; rb_link_node(&call->sock_node, parent, pp); @@ -351,7 +352,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, BUG(); rxrpc_queue_call(call); - rxrpc_get_call(call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); _leave(" = %p{%d}", call, call->debug_id); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 4754c7fb6242..fee8b6ddb334 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -1246,7 +1246,7 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 65691742199b..3166b5222435 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,21 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; +const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { + [rxrpc_call_new_client] = "NWc", + [rxrpc_call_new_service] = "NWs", + [rxrpc_call_queued] = "QUE", + [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_seen] = "SEE", + [rxrpc_call_got] = "GOT", + [rxrpc_call_got_skb] = "Gsk", + [rxrpc_call_got_userid] = "Gus", + [rxrpc_call_put] = "PUT", + [rxrpc_call_put_skb] = "Psk", + [rxrpc_call_put_userid] = "Pus", + [rxrpc_call_put_noqueue] = "PNQ", +}; + struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); @@ -96,7 +111,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, return NULL; found_extant_call: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, atomic_read(&call->usage)); return call; @@ -252,8 +267,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } - rxrpc_get_call(call); - + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); @@ -275,7 +289,7 @@ error: write_lock(&rx->call_lock); rb_erase(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put_userid); write_lock_bh(&rxrpc_call_lock); list_del_init(&call->link); @@ -283,7 +297,7 @@ error: set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); @@ -296,7 +310,7 @@ found_user_ID_now_present: write_unlock(&rx->call_lock); set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = -EEXIST [%p]", call); return ERR_PTR(-EEXIST); } @@ -322,8 +336,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); - trace_rxrpc_call(candidate, 1, atomic_read(&candidate->usage), - 0, here, NULL); + trace_rxrpc_call(candidate, rxrpc_call_new_service, + atomic_read(&candidate->usage), 0, here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; @@ -358,7 +372,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, read_unlock(&call->state_lock); goto aborted_call; default: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&call->state_lock); goto extant_call; } @@ -447,20 +461,20 @@ void rxrpc_see_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 2, n, m, here, 0); + trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); } } /* * Note the addition of a ref on a call. */ -void rxrpc_get_call(struct rxrpc_call *call) +void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 3, n, m, here, 0); + trace_rxrpc_call(call, op, n, m, here, NULL); } /* @@ -472,7 +486,7 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) int n = atomic_inc_return(&call->usage); int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, 4, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); } /* @@ -575,7 +589,7 @@ static void rxrpc_dead_call_expired(unsigned long _call) write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_DEAD; write_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } /* @@ -632,7 +646,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* * release a call */ -void rxrpc_put_call(struct rxrpc_call *call) +void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n, m; @@ -641,7 +655,7 @@ void rxrpc_put_call(struct rxrpc_call *call) n = atomic_dec_return(&call->usage); m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 5, n, m, here, NULL); + trace_rxrpc_call(call, op, n, m, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); @@ -661,7 +675,7 @@ void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) n = atomic_dec_return(&call->usage); m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, 6, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 72f016cfaaf5..f7239a6f9181 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -537,7 +537,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, } read_unlock(&call->state_lock); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -545,12 +545,12 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, else rxrpc_fast_process_packet(call, skb); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); goto done; resend_final_ack: _debug("final ack again"); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0ab7b334bab1..97f8ee76c67c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -79,7 +79,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, + rxrpc_call_put); return -ENODATA; } } @@ -137,13 +138,13 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (call != continue_call || skb->mark != RXRPC_SKB_MARK_DATA) { release_sock(&rx->sk); - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [noncont]", copied); return copied; } } - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); /* copy the peer address and timestamp */ if (!continue_call) { @@ -233,7 +234,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (!continue_call) continue_call = sp->call; else - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); call = NULL; if (flags & MSG_PEEK) { @@ -255,9 +256,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, out: release_sock(&rx->sk); if (call) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [data]", copied); return copied; @@ -341,18 +342,18 @@ terminal_message: } release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; copy_error: _debug("copy error"); release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; @@ -361,7 +362,7 @@ wait_interrupted: wait_error: finish_wait(sk_sleep(&rx->sk), &wait); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); if (copied) copied = ret; _leave(" = %d [waitfail %d]", copied, ret); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7376794a0308..803078bea507 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -534,7 +534,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_accept_call(rx, user_call_ID, NULL); if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); return 0; } @@ -573,7 +573,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index a546a2ba6341..c0613ab6d2d5 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -35,7 +35,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* get an extra ref on the call for the final-ACK generator to * release */ - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); -- cgit v1.2.3 From 5a42976d4fe5d7fddce133de995c742c87b1b7e3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Sep 2016 22:19:51 +0100 Subject: rxrpc: Add tracepoint for working out where aborts happen Add a tracepoint for working out where local aborts happen. Each tracepoint call is labelled with a 3-letter code so that they can be distinguished - and the DATA sequence number is added too where available. rxrpc_kernel_abort_call() also takes a 3-letter code so that AFS can indicate the circumstances when it aborts a call. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 17 ++++--- include/net/af_rxrpc.h | 3 +- include/trace/events/rxrpc.h | 29 ++++++++++++ net/rxrpc/ar-internal.h | 14 ++++-- net/rxrpc/call_event.c | 7 +-- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 6 +++ net/rxrpc/input.c | 7 +-- net/rxrpc/insecure.c | 19 ++++---- net/rxrpc/rxkad.c | 108 +++++++++++++++++++------------------------ net/rxrpc/sendmsg.c | 18 ++++---- 11 files changed, 132 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 37608be52abd..53750dece80e 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -377,7 +377,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -425,12 +425,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KNC"); goto do_abort; case -ENOTSUPP: abort_code = RX_INVALID_OPERATION; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KIV"); goto do_abort; case -ENODATA: case -EBADMSG: @@ -440,7 +440,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, EBADMSG, "KUM"); goto do_abort; } } @@ -463,6 +463,7 @@ do_abort: */ static int afs_wait_for_call_to_complete(struct afs_call *call) { + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -481,9 +482,11 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) continue; } + abort_why = "KWC"; ret = call->error; if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -497,7 +500,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_CALL_DEAD); + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -695,7 +698,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -734,7 +737,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b4b6a3664dda..08ed8729126c 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -35,7 +35,8 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); -void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32); +void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, + u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, rxrpc_notify_rx_t); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 30164896f1f6..85ee035774ae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -84,6 +84,35 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_abort, + TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, + int abort_code, int error), + + TP_ARGS(why, cid, call_id, seq, abort_code, error), + + TP_STRUCT__entry( + __array(char, why, 4 ) + __field(u32, cid ) + __field(u32, call_id ) + __field(rxrpc_seq_t, seq ) + __field(int, abort_code ) + __field(int, error ) + ), + + TP_fast_assign( + memcpy(__entry->why, why, 4); + __entry->cid = cid; + __entry->call_id = call_id; + __entry->abort_code = abort_code; + __entry->error = error; + __entry->seq = seq; + ), + + TP_printk("%08x:%08x s=%u a=%d e=%d %s", + __entry->cid, __entry->call_id, __entry->seq, + __entry->abort_code, __entry->error, __entry->why) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0353399792b6..dbfb9ed17483 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -155,7 +155,8 @@ struct rxrpc_security { void *); /* verify the security on a received packet */ - int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, + rxrpc_seq_t, u16); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -637,9 +638,12 @@ static inline bool rxrpc_call_completed(struct rxrpc_call *call) /* * Record that a call is locally aborted. */ -static inline bool __rxrpc_abort_call(struct rxrpc_call *call, +static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { + trace_rxrpc_abort(why, call->cid, call->call_id, seq, + abort_code, error); if (__rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error)) { @@ -649,13 +653,13 @@ static inline bool __rxrpc_abort_call(struct rxrpc_call *call, return false; } -static inline bool rxrpc_abort_call(struct rxrpc_call *call, - u32 abort_code, int error) +static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { bool ret; write_lock_bh(&call->state_lock); - ret = __rxrpc_abort_call(call, abort_code, error); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); write_unlock_bh(&call->state_lock); return ret; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8365d3366114..af88ad7d2cf9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -598,7 +598,8 @@ process_further: /* secured packets must be verified and possibly decrypted */ if (call->conn->security->verify_packet(call, skb, - _abort_code) < 0) + sp->hdr.seq, + sp->hdr.cksum) < 0) goto protocol_error; rxrpc_insert_oos_packet(call, skb); @@ -982,7 +983,7 @@ skip_msg_init: } if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { - rxrpc_abort_call(call, RX_CALL_TIMEOUT, ETIME); + rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); _debug("post timeout"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, @@ -1005,7 +1006,7 @@ skip_msg_init: case -EKEYEXPIRED: case -EKEYREJECTED: case -EPROTO: - rxrpc_abort_call(call, abort_code, -ret); + rxrpc_abort_call("PRO", call, 0, abort_code, -ret); goto kill_ACKs; } } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index be5733d55794..9efd9b0b0bdf 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -563,7 +563,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) if (call->state < RXRPC_CALL_COMPLETE) { _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); + __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9db90f4f768d..8c7938ba6a84 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -158,6 +158,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, lockdep_is_held(&conn->channel_lock)); if (call) { rxrpc_see_call(call); + if (compl == RXRPC_CALL_LOCALLY_ABORTED) + trace_rxrpc_abort("CON", call->cid, + call->call_id, 0, + abort_code, error); + write_lock_bh(&call->state_lock); if (rxrpc_set_call_completion(call, compl, abort_code, error)) { @@ -167,6 +172,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, write_unlock_bh(&call->state_lock); if (queue) rxrpc_queue_call(call); + } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 79f3f585cdc3..8e624109750a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -429,7 +429,7 @@ protocol_error: _debug("protocol error"); write_lock_bh(&call->state_lock); protocol_error_locked: - if (__rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); free_packet_unlock: write_unlock_bh(&call->state_lock); @@ -495,9 +495,10 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, protocol_error: _debug("protocol error"); rxrpc_free_skb(part); - rxrpc_free_skb(jumbo); - if (rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (rxrpc_abort_call("PJP", call, sp->hdr.seq, + RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); + rxrpc_free_skb(jumbo); _leave(""); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index c21ad213b337..a4aba0246731 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -23,31 +23,32 @@ static int none_prime_packet_security(struct rxrpc_connection *conn) } static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { return 0; } static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + rxrpc_seq_t seq, + u16 expected_cksum) { return 0; } static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 89f475febfd7..3777432df10b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -316,12 +316,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* * decrypt partial encryption on a packet (level 1 security) */ -static int rxkad_verify_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { struct rxkad_level1_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; @@ -332,7 +330,10 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, _enter(""); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -351,9 +352,11 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, crypto_skcipher_decrypt(req); skcipher_request_zero(req); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -361,24 +364,24 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -391,13 +394,11 @@ nomem: /* * wholly decrypt a packet (level 2 security) */ -static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; @@ -408,7 +409,10 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, _enter(",{%d}", skb->len); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -437,9 +441,11 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, if (sg != _sg) kfree(sg); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -447,24 +453,23 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -475,40 +480,30 @@ nomem: } /* - * verify the security on a received packet + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). */ -static int rxkad_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); - struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg; u16 cksum; u32 x, y; - int ret; - - sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), seq); if (!call->conn->cipher) return 0; - if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) { - *_abort_code = RXKADINCONSISTENCY; - _leave(" = -EPROTO [not rxkad]"); - return -EPROTO; - } - /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); - x |= sp->hdr.seq & 0x3fffffff; + x |= seq & 0x3fffffff; call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); @@ -524,29 +519,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ - if (sp->hdr.cksum != cksum) { - *_abort_code = RXKADSEALEDINCON; + if (cksum != expected_cksum) { + rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - ret = 0; - break; + return 0; case RXRPC_SECURITY_AUTH: - ret = rxkad_verify_packet_auth(call, skb, _abort_code); - break; + return rxkad_verify_packet_1(call, skb, seq); case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); - break; + return rxkad_verify_packet_2(call, skb, seq); default: - ret = -ENOANO; - break; + return -ENOANO; } - - _leave(" = %d", ret); - return ret; } /* diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2439aff131c7..9a4af992fcdf 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -454,14 +454,15 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * abort a call, sending an ABORT packet to the peer */ -static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, + u32 abort_code, int error) { if (call->state >= RXRPC_CALL_COMPLETE) return; write_lock_bh(&call->state_lock); - if (__rxrpc_abort_call(call, abort_code, ECONNABORTED)) { + if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); @@ -556,7 +557,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; @@ -626,20 +627,19 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: 3-char string indicating why. * * Allow a kernel service to abort a call, if it's still in an abortable state. */ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, - u32 abort_code) + u32 abort_code, int error, const char *why) { - _enter("{%d},%d", call->debug_id, abort_code); + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); lock_sock(sock->sk); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, why, abort_code, error); release_sock(sock->sk); _leave(""); -- cgit v1.2.3 From c965db44462919f613973aa618271f6c3f5a1e64 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:24 +0300 Subject: qed: Add support for debug data collection This patch adds the support for dumping and formatting the HW/FW debug data. Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/Makefile | 2 +- drivers/net/ethernet/qlogic/qed/qed.h | 20 + drivers/net/ethernet/qlogic/qed/qed_debug.c | 6898 ++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_debug.h | 54 + drivers/net/ethernet/qlogic/qed/qed_hsi.h | 1056 +++- drivers/net/ethernet/qlogic/qed/qed_main.c | 4 + drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 894 +++ include/linux/qed/common_hsi.h | 3 + 8 files changed, 8919 insertions(+), 12 deletions(-) create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.h (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index d1f157e439cf..86a5b4f5f870 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -2,5 +2,5 @@ obj-$(CONFIG_QED) := qed.o qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \ - qed_selftest.o qed_dcbx.o + qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 2d67469eb8f6..0929582fc82b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -23,6 +23,7 @@ #include #include #include +#include "qed_debug.h" #include "qed_hsi.h" extern const struct qed_common_ops qed_common_ops_pass; @@ -395,6 +396,8 @@ struct qed_hwfn { /* Buffer for unzipping firmware data */ void *unzip_buf; + struct dbg_tools_data dbg_info; + struct qed_simd_fp_handler simd_proto_handler[64]; #ifdef CONFIG_QED_SRIOV @@ -430,6 +433,19 @@ struct qed_int_params { u8 fp_msix_cnt; }; +struct qed_dbg_feature { + struct dentry *dentry; + u8 *dump_buf; + u32 buf_size; + u32 dumped_dwords; +}; + +struct qed_dbg_params { + struct qed_dbg_feature features[DBG_FEATURE_NUM]; + u8 engine_for_debug; + bool print_data; +}; + struct qed_dev { u32 dp_module; u8 dp_level; @@ -444,6 +460,8 @@ struct qed_dev { CHIP_REV_IS_A0(dev)) #define QED_IS_BB_B0(dev) (QED_IS_BB(dev) && \ CHIP_REV_IS_B0(dev)) +#define QED_IS_AH(dev) ((dev)->type == QED_DEV_TYPE_AH) +#define QED_IS_K2(dev) QED_IS_AH(dev) #define QED_GET_TYPE(dev) (QED_IS_BB_A0(dev) ? CHIP_BB_A0 : \ QED_IS_BB_B0(dev) ? CHIP_BB_B0 : CHIP_K2) @@ -544,6 +562,8 @@ struct qed_dev { } protocol_ops; void *ops_cookie; + struct qed_dbg_params dbg_params; + const struct firmware *firmware; }; diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c new file mode 100644 index 000000000000..88e7d5bef909 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -0,0 +1,6898 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" + +/* Chip IDs enum */ +enum chip_ids { + CHIP_RESERVED, + CHIP_BB_B0, + CHIP_K2, + MAX_CHIP_IDS +}; + +/* Memory groups enum */ +enum mem_groups { + MEM_GROUP_PXP_MEM, + MEM_GROUP_DMAE_MEM, + MEM_GROUP_CM_MEM, + MEM_GROUP_QM_MEM, + MEM_GROUP_TM_MEM, + MEM_GROUP_BRB_RAM, + MEM_GROUP_BRB_MEM, + MEM_GROUP_PRS_MEM, + MEM_GROUP_SDM_MEM, + MEM_GROUP_PBUF, + MEM_GROUP_IOR, + MEM_GROUP_RAM, + MEM_GROUP_BTB_RAM, + MEM_GROUP_RDIF_CTX, + MEM_GROUP_TDIF_CTX, + MEM_GROUP_CONN_CFC_MEM, + MEM_GROUP_TASK_CFC_MEM, + MEM_GROUP_CAU_PI, + MEM_GROUP_CAU_MEM, + MEM_GROUP_PXP_ILT, + MEM_GROUP_MULD_MEM, + MEM_GROUP_BTB_MEM, + MEM_GROUP_IGU_MEM, + MEM_GROUP_IGU_MSIX, + MEM_GROUP_CAU_SB, + MEM_GROUP_BMB_RAM, + MEM_GROUP_BMB_MEM, + MEM_GROUPS_NUM +}; + +/* Memory groups names */ +static const char * const s_mem_group_names[] = { + "PXP_MEM", + "DMAE_MEM", + "CM_MEM", + "QM_MEM", + "TM_MEM", + "BRB_RAM", + "BRB_MEM", + "PRS_MEM", + "SDM_MEM", + "PBUF", + "IOR", + "RAM", + "BTB_RAM", + "RDIF_CTX", + "TDIF_CTX", + "CONN_CFC_MEM", + "TASK_CFC_MEM", + "CAU_PI", + "CAU_MEM", + "PXP_ILT", + "MULD_MEM", + "BTB_MEM", + "IGU_MEM", + "IGU_MSIX", + "CAU_SB", + "BMB_RAM", + "BMB_MEM", +}; + +/* Idle check conditions */ +static u32 cond4(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) != imm[1]) && ((r[1] & imm[2]) != imm[3]); +} + +static u32 cond6(const u32 *r, const u32 *imm) +{ + return ((r[0] >> imm[0]) & imm[1]) != imm[2]; +} + +static u32 cond5(const u32 *r, const u32 *imm) +{ + return (r[0] & imm[0]) != imm[1]; +} + +static u32 cond8(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != + (((r[0] & imm[2]) >> imm[3]) | ((r[1] & imm[4]) << imm[5])); +} + +static u32 cond9(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != (r[0] & imm[2]); +} + +static u32 cond1(const u32 *r, const u32 *imm) +{ + return (r[0] & ~imm[0]) != imm[1]; +} + +static u32 cond0(const u32 *r, const u32 *imm) +{ + return r[0] != imm[0]; +} + +static u32 cond10(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] == imm[0]; +} + +static u32 cond11(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] > imm[0]; +} + +static u32 cond3(const u32 *r, const u32 *imm) +{ + return r[0] != r[1]; +} + +static u32 cond12(const u32 *r, const u32 *imm) +{ + return r[0] & imm[0]; +} + +static u32 cond7(const u32 *r, const u32 *imm) +{ + return r[0] < (r[1] - imm[0]); +} + +static u32 cond2(const u32 *r, const u32 *imm) +{ + return r[0] > imm[0]; +} + +/* Array of Idle Check conditions */ +static u32(*cond_arr[]) (const u32 *r, const u32 *imm) = { + cond0, + cond1, + cond2, + cond3, + cond4, + cond5, + cond6, + cond7, + cond8, + cond9, + cond10, + cond11, + cond12, +}; + +/******************************* Data Types **********************************/ + +enum platform_ids { + PLATFORM_ASIC, + PLATFORM_RESERVED, + PLATFORM_RESERVED2, + PLATFORM_RESERVED3, + MAX_PLATFORM_IDS +}; + +struct dbg_array { + const u32 *ptr; + u32 size_in_dwords; +}; + +/* Chip constant definitions */ +struct chip_defs { + const char *name; + struct { + u8 num_ports; + u8 num_pfs; + } per_platform[MAX_PLATFORM_IDS]; +}; + +/* Platform constant definitions */ +struct platform_defs { + const char *name; + u32 delay_factor; +}; + +/* Storm constant definitions */ +struct storm_defs { + char letter; + enum block_id block_id; + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + bool has_vfc; + u32 sem_fast_mem_addr; + u32 sem_frame_mode_addr; + u32 sem_slow_enable_addr; + u32 sem_slow_mode_addr; + u32 sem_slow_mode1_conf_addr; + u32 sem_sync_dbg_empty_addr; + u32 sem_slow_dbg_empty_addr; + u32 cm_ctx_wr_addr; + u32 cm_conn_ag_ctx_lid_size; /* In quad-regs */ + u32 cm_conn_ag_ctx_rd_addr; + u32 cm_conn_st_ctx_lid_size; /* In quad-regs */ + u32 cm_conn_st_ctx_rd_addr; + u32 cm_task_ag_ctx_lid_size; /* In quad-regs */ + u32 cm_task_ag_ctx_rd_addr; + u32 cm_task_st_ctx_lid_size; /* In quad-regs */ + u32 cm_task_st_ctx_rd_addr; +}; + +/* Block constant definitions */ +struct block_defs { + const char *name; + bool has_dbg_bus[MAX_CHIP_IDS]; + bool associated_to_storm; + u32 storm_id; /* Valid only if associated_to_storm is true */ + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + u32 dbg_select_addr; + u32 dbg_cycle_enable_addr; + u32 dbg_shift_addr; + u32 dbg_force_valid_addr; + u32 dbg_force_frame_addr; + bool has_reset_bit; + bool unreset; /* If true, the block is taken out of reset before dump */ + enum dbg_reset_regs reset_reg; + u8 reset_bit_offset; /* Bit offset in reset register */ +}; + +/* Reset register definitions */ +struct reset_reg_defs { + u32 addr; + u32 unreset_val; + bool exists[MAX_CHIP_IDS]; +}; + +struct grc_param_defs { + u32 default_val[MAX_CHIP_IDS]; + u32 min; + u32 max; + bool is_preset; + u32 exclude_all_preset_val; + u32 crash_preset_val; +}; + +struct rss_mem_defs { + const char *mem_name; + const char *type_name; + u32 addr; /* In 128b units */ + u32 num_entries[MAX_CHIP_IDS]; + u32 entry_width[MAX_CHIP_IDS]; /* In bits */ +}; + +struct vfc_ram_defs { + const char *mem_name; + const char *type_name; + u32 base_row; + u32 num_rows; +}; + +struct big_ram_defs { + const char *instance_name; + enum mem_groups mem_group_id; + enum mem_groups ram_mem_group_id; + enum dbg_grc_params grc_param; + u32 addr_reg_addr; + u32 data_reg_addr; + u32 num_of_blocks[MAX_CHIP_IDS]; +}; + +struct phy_defs { + const char *phy_name; + u32 base_addr; + u32 tbus_addr_lo_addr; + u32 tbus_addr_hi_addr; + u32 tbus_data_lo_addr; + u32 tbus_data_hi_addr; +}; + +/******************************** Constants **********************************/ + +#define MAX_LCIDS 320 +#define MAX_LTIDS 320 +#define NUM_IOR_SETS 2 +#define IORS_PER_SET 176 +#define IOR_SET_OFFSET(set_id) ((set_id) * 256) +#define BYTES_IN_DWORD sizeof(u32) + +/* In the macros below, size and offset are specified in bits */ +#define CEIL_DWORDS(size) DIV_ROUND_UP(size, 32) +#define FIELD_BIT_OFFSET(type, field) type ## _ ## field ## _ ## OFFSET +#define FIELD_BIT_SIZE(type, field) type ## _ ## field ## _ ## SIZE +#define FIELD_DWORD_OFFSET(type, field) \ + (int)(FIELD_BIT_OFFSET(type, field) / 32) +#define FIELD_DWORD_SHIFT(type, field) (FIELD_BIT_OFFSET(type, field) % 32) +#define FIELD_BIT_MASK(type, field) \ + (((1 << FIELD_BIT_SIZE(type, field)) - 1) << \ + FIELD_DWORD_SHIFT(type, field)) +#define SET_VAR_FIELD(var, type, field, val) \ + do { \ + var[FIELD_DWORD_OFFSET(type, field)] &= \ + (~FIELD_BIT_MASK(type, field)); \ + var[FIELD_DWORD_OFFSET(type, field)] |= \ + (val) << FIELD_DWORD_SHIFT(type, field); \ + } while (0) +#define ARR_REG_WR(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + qed_wr(dev, ptt, addr, (arr)[i]); \ + } while (0) +#define ARR_REG_RD(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + (arr)[i] = qed_rd(dev, ptt, addr); \ + } while (0) + +#define DWORDS_TO_BYTES(dwords) ((dwords) * BYTES_IN_DWORD) +#define BYTES_TO_DWORDS(bytes) ((bytes) / BYTES_IN_DWORD) +#define RAM_LINES_TO_DWORDS(lines) ((lines) * 2) +#define RAM_LINES_TO_BYTES(lines) \ + DWORDS_TO_BYTES(RAM_LINES_TO_DWORDS(lines)) +#define REG_DUMP_LEN_SHIFT 24 +#define MEM_DUMP_ENTRY_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_dump_mem)) +#define IDLE_CHK_RULE_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_rule)) +#define IDLE_CHK_RESULT_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_hdr)) +#define IDLE_CHK_RESULT_REG_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_reg_hdr)) +#define IDLE_CHK_MAX_ENTRIES_SIZE 32 + +/* The sizes and offsets below are specified in bits */ +#define VFC_CAM_CMD_STRUCT_SIZE 64 +#define VFC_CAM_CMD_ROW_OFFSET 48 +#define VFC_CAM_CMD_ROW_SIZE 9 +#define VFC_CAM_ADDR_STRUCT_SIZE 16 +#define VFC_CAM_ADDR_OP_OFFSET 0 +#define VFC_CAM_ADDR_OP_SIZE 4 +#define VFC_CAM_RESP_STRUCT_SIZE 256 +#define VFC_RAM_ADDR_STRUCT_SIZE 16 +#define VFC_RAM_ADDR_OP_OFFSET 0 +#define VFC_RAM_ADDR_OP_SIZE 2 +#define VFC_RAM_ADDR_ROW_OFFSET 2 +#define VFC_RAM_ADDR_ROW_SIZE 10 +#define VFC_RAM_RESP_STRUCT_SIZE 256 +#define VFC_CAM_CMD_DWORDS CEIL_DWORDS(VFC_CAM_CMD_STRUCT_SIZE) +#define VFC_CAM_ADDR_DWORDS CEIL_DWORDS(VFC_CAM_ADDR_STRUCT_SIZE) +#define VFC_CAM_RESP_DWORDS CEIL_DWORDS(VFC_CAM_RESP_STRUCT_SIZE) +#define VFC_RAM_CMD_DWORDS VFC_CAM_CMD_DWORDS +#define VFC_RAM_ADDR_DWORDS CEIL_DWORDS(VFC_RAM_ADDR_STRUCT_SIZE) +#define VFC_RAM_RESP_DWORDS CEIL_DWORDS(VFC_RAM_RESP_STRUCT_SIZE) +#define NUM_VFC_RAM_TYPES 4 +#define VFC_CAM_NUM_ROWS 512 +#define VFC_OPCODE_CAM_RD 14 +#define VFC_OPCODE_RAM_RD 0 +#define NUM_RSS_MEM_TYPES 5 +#define NUM_BIG_RAM_TYPES 3 +#define BIG_RAM_BLOCK_SIZE_BYTES 128 +#define BIG_RAM_BLOCK_SIZE_DWORDS \ + BYTES_TO_DWORDS(BIG_RAM_BLOCK_SIZE_BYTES) +#define NUM_PHY_TBUS_ADDRESSES 2048 +#define PHY_DUMP_SIZE_DWORDS (NUM_PHY_TBUS_ADDRESSES / 2) +#define RESET_REG_UNRESET_OFFSET 4 +#define STALL_DELAY_MS 500 +#define STATIC_DEBUG_LINE_DWORDS 9 +#define NUM_DBG_BUS_LINES 256 +#define NUM_COMMON_GLOBAL_PARAMS 8 +#define FW_IMG_MAIN 1 +#define REG_FIFO_DEPTH_ELEMENTS 32 +#define REG_FIFO_ELEMENT_DWORDS 2 +#define REG_FIFO_DEPTH_DWORDS \ + (REG_FIFO_ELEMENT_DWORDS * REG_FIFO_DEPTH_ELEMENTS) +#define IGU_FIFO_DEPTH_ELEMENTS 64 +#define IGU_FIFO_ELEMENT_DWORDS 4 +#define IGU_FIFO_DEPTH_DWORDS \ + (IGU_FIFO_ELEMENT_DWORDS * IGU_FIFO_DEPTH_ELEMENTS) +#define PROTECTION_OVERRIDE_DEPTH_ELEMENTS 20 +#define PROTECTION_OVERRIDE_ELEMENT_DWORDS 2 +#define PROTECTION_OVERRIDE_DEPTH_DWORDS \ + (PROTECTION_OVERRIDE_DEPTH_ELEMENTS * \ + PROTECTION_OVERRIDE_ELEMENT_DWORDS) +#define MCP_SPAD_TRACE_OFFSIZE_ADDR \ + (MCP_REG_SCRATCH + \ + offsetof(struct static_init, sections[SPAD_SECTION_TRACE])) +#define MCP_TRACE_META_IMAGE_SIGNATURE 0x669955aa +#define EMPTY_FW_VERSION_STR "???_???_???_???" +#define EMPTY_FW_IMAGE_STR "???????????????" + +/***************************** Constant Arrays *******************************/ + +/* Debug arrays */ +static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {0} }; + +/* Chip constant definitions array */ +static struct chip_defs s_chip_defs[MAX_CHIP_IDS] = { + { "reserved", { {0, 0}, {0, 0}, {0, 0}, {0, 0} } }, + { "bb_b0", + { {MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB}, {0, 0}, {0, 0}, {0, 0} } }, + { "k2", { {MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2}, {0, 0}, {0, 0}, {0, 0} } } +}; + +/* Storm constant definitions array */ +static struct storm_defs s_storm_defs[] = { + /* Tstorm */ + {'T', BLOCK_TSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCT}, true, + TSEM_REG_FAST_MEMORY, + TSEM_REG_DBG_FRAME_MODE, TSEM_REG_SLOW_DBG_ACTIVE, + TSEM_REG_SLOW_DBG_MODE, TSEM_REG_DBG_MODE1_CFG, + TSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY, + TCM_REG_CTX_RBC_ACCS, + 4, TCM_REG_AGG_CON_CTX, + 16, TCM_REG_SM_CON_CTX, + 2, TCM_REG_AGG_TASK_CTX, + 4, TCM_REG_SM_TASK_CTX}, + /* Mstorm */ + {'M', BLOCK_MSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCM}, false, + MSEM_REG_FAST_MEMORY, + MSEM_REG_DBG_FRAME_MODE, MSEM_REG_SLOW_DBG_ACTIVE, + MSEM_REG_SLOW_DBG_MODE, MSEM_REG_DBG_MODE1_CFG, + MSEM_REG_SYNC_DBG_EMPTY, MSEM_REG_SLOW_DBG_EMPTY, + MCM_REG_CTX_RBC_ACCS, + 1, MCM_REG_AGG_CON_CTX, + 10, MCM_REG_SM_CON_CTX, + 2, MCM_REG_AGG_TASK_CTX, + 7, MCM_REG_SM_TASK_CTX}, + /* Ustorm */ + {'U', BLOCK_USEM, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCU}, false, + USEM_REG_FAST_MEMORY, + USEM_REG_DBG_FRAME_MODE, USEM_REG_SLOW_DBG_ACTIVE, + USEM_REG_SLOW_DBG_MODE, USEM_REG_DBG_MODE1_CFG, + USEM_REG_SYNC_DBG_EMPTY, USEM_REG_SLOW_DBG_EMPTY, + UCM_REG_CTX_RBC_ACCS, + 2, UCM_REG_AGG_CON_CTX, + 13, UCM_REG_SM_CON_CTX, + 3, UCM_REG_AGG_TASK_CTX, + 3, UCM_REG_SM_TASK_CTX}, + /* Xstorm */ + {'X', BLOCK_XSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCX}, false, + XSEM_REG_FAST_MEMORY, + XSEM_REG_DBG_FRAME_MODE, XSEM_REG_SLOW_DBG_ACTIVE, + XSEM_REG_SLOW_DBG_MODE, XSEM_REG_DBG_MODE1_CFG, + XSEM_REG_SYNC_DBG_EMPTY, XSEM_REG_SLOW_DBG_EMPTY, + XCM_REG_CTX_RBC_ACCS, + 9, XCM_REG_AGG_CON_CTX, + 15, XCM_REG_SM_CON_CTX, + 0, 0, + 0, 0}, + /* Ystorm */ + {'Y', BLOCK_YSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCY}, false, + YSEM_REG_FAST_MEMORY, + YSEM_REG_DBG_FRAME_MODE, YSEM_REG_SLOW_DBG_ACTIVE, + YSEM_REG_SLOW_DBG_MODE, YSEM_REG_DBG_MODE1_CFG, + YSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY, + YCM_REG_CTX_RBC_ACCS, + 2, YCM_REG_AGG_CON_CTX, + 3, YCM_REG_SM_CON_CTX, + 2, YCM_REG_AGG_TASK_CTX, + 12, YCM_REG_SM_TASK_CTX}, + /* Pstorm */ + {'P', BLOCK_PSEM, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCS}, true, + PSEM_REG_FAST_MEMORY, + PSEM_REG_DBG_FRAME_MODE, PSEM_REG_SLOW_DBG_ACTIVE, + PSEM_REG_SLOW_DBG_MODE, PSEM_REG_DBG_MODE1_CFG, + PSEM_REG_SYNC_DBG_EMPTY, PSEM_REG_SLOW_DBG_EMPTY, + PCM_REG_CTX_RBC_ACCS, + 0, 0, + 10, PCM_REG_SM_CON_CTX, + 0, 0, + 0, 0} +}; + +/* Block definitions array */ +static struct block_defs block_grc_defs = { + "grc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + GRC_REG_DBG_SELECT, GRC_REG_DBG_DWORD_ENABLE, + GRC_REG_DBG_SHIFT, GRC_REG_DBG_FORCE_VALID, + GRC_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_UA, 1 +}; + +static struct block_defs block_miscs_defs = { + "miscs", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_misc_defs = { + "misc", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbu_defs = { + "dbu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_pglue_b_defs = { + "pglue_b", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH}, + PGLUE_B_REG_DBG_SELECT, PGLUE_B_REG_DBG_DWORD_ENABLE, + PGLUE_B_REG_DBG_SHIFT, PGLUE_B_REG_DBG_FORCE_VALID, + PGLUE_B_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 1 +}; + +static struct block_defs block_cnig_defs = { + "cnig", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + CNIG_REG_DBG_SELECT_K2, CNIG_REG_DBG_DWORD_ENABLE_K2, + CNIG_REG_DBG_SHIFT_K2, CNIG_REG_DBG_FORCE_VALID_K2, + CNIG_REG_DBG_FORCE_FRAME_K2, + true, false, DBG_RESET_REG_MISCS_PL_HV, 0 +}; + +static struct block_defs block_cpmu_defs = { + "cpmu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 8 +}; + +static struct block_defs block_ncsi_defs = { + "ncsi", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + NCSI_REG_DBG_SELECT, NCSI_REG_DBG_DWORD_ENABLE, + NCSI_REG_DBG_SHIFT, NCSI_REG_DBG_FORCE_VALID, + NCSI_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 5 +}; + +static struct block_defs block_opte_defs = { + "opte", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 4 +}; + +static struct block_defs block_bmb_defs = { + "bmb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCB}, + BMB_REG_DBG_SELECT, BMB_REG_DBG_DWORD_ENABLE, + BMB_REG_DBG_SHIFT, BMB_REG_DBG_FORCE_VALID, + BMB_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_UA, 7 +}; + +static struct block_defs block_pcie_defs = { + "pcie", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, + PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, + PCIE_REG_DBG_COMMON_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp_defs = { + "mcp", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp2_defs = { + "mcp2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + MCP2_REG_DBG_SELECT, MCP2_REG_DBG_DWORD_ENABLE, + MCP2_REG_DBG_SHIFT, MCP2_REG_DBG_FORCE_VALID, + MCP2_REG_DBG_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_pswhst_defs = { + "pswhst", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST_REG_DBG_SELECT, PSWHST_REG_DBG_DWORD_ENABLE, + PSWHST_REG_DBG_SHIFT, PSWHST_REG_DBG_FORCE_VALID, + PSWHST_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswhst2_defs = { + "pswhst2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST2_REG_DBG_SELECT, PSWHST2_REG_DBG_DWORD_ENABLE, + PSWHST2_REG_DBG_SHIFT, PSWHST2_REG_DBG_FORCE_VALID, + PSWHST2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswrd_defs = { + "pswrd", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD_REG_DBG_SELECT, PSWRD_REG_DBG_DWORD_ENABLE, + PSWRD_REG_DBG_SHIFT, PSWRD_REG_DBG_FORCE_VALID, + PSWRD_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswrd2_defs = { + "pswrd2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD2_REG_DBG_SELECT, PSWRD2_REG_DBG_DWORD_ENABLE, + PSWRD2_REG_DBG_SHIFT, PSWRD2_REG_DBG_FORCE_VALID, + PSWRD2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswwr_defs = { + "pswwr", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWWR_REG_DBG_SELECT, PSWWR_REG_DBG_DWORD_ENABLE, + PSWWR_REG_DBG_SHIFT, PSWWR_REG_DBG_FORCE_VALID, + PSWWR_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswwr2_defs = { + "pswwr2", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswrq_defs = { + "pswrq", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ_REG_DBG_SELECT, PSWRQ_REG_DBG_DWORD_ENABLE, + PSWRQ_REG_DBG_SHIFT, PSWRQ_REG_DBG_FORCE_VALID, + PSWRQ_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pswrq2_defs = { + "pswrq2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ2_REG_DBG_SELECT, PSWRQ2_REG_DBG_DWORD_ENABLE, + PSWRQ2_REG_DBG_SHIFT, PSWRQ2_REG_DBG_FORCE_VALID, + PSWRQ2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pglcs_defs = { + "pglcs", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PGLCS_REG_DBG_SELECT, PGLCS_REG_DBG_DWORD_ENABLE, + PGLCS_REG_DBG_SHIFT, PGLCS_REG_DBG_FORCE_VALID, + PGLCS_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 2 +}; + +static struct block_defs block_ptu_defs = { + "ptu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PTU_REG_DBG_SELECT, PTU_REG_DBG_DWORD_ENABLE, + PTU_REG_DBG_SHIFT, PTU_REG_DBG_FORCE_VALID, + PTU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 20 +}; + +static struct block_defs block_dmae_defs = { + "dmae", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + DMAE_REG_DBG_SELECT, DMAE_REG_DBG_DWORD_ENABLE, + DMAE_REG_DBG_SHIFT, DMAE_REG_DBG_FORCE_VALID, + DMAE_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 28 +}; + +static struct block_defs block_tcm_defs = { + "tcm", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TCM_REG_DBG_SELECT, TCM_REG_DBG_DWORD_ENABLE, + TCM_REG_DBG_SHIFT, TCM_REG_DBG_FORCE_VALID, + TCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 5 +}; + +static struct block_defs block_mcm_defs = { + "mcm", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MCM_REG_DBG_SELECT, MCM_REG_DBG_DWORD_ENABLE, + MCM_REG_DBG_SHIFT, MCM_REG_DBG_FORCE_VALID, + MCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 3 +}; + +static struct block_defs block_ucm_defs = { + "ucm", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + UCM_REG_DBG_SELECT, UCM_REG_DBG_DWORD_ENABLE, + UCM_REG_DBG_SHIFT, UCM_REG_DBG_FORCE_VALID, + UCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 8 +}; + +static struct block_defs block_xcm_defs = { + "xcm", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XCM_REG_DBG_SELECT, XCM_REG_DBG_DWORD_ENABLE, + XCM_REG_DBG_SHIFT, XCM_REG_DBG_FORCE_VALID, + XCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 19 +}; + +static struct block_defs block_ycm_defs = { + "ycm", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YCM_REG_DBG_SELECT, YCM_REG_DBG_DWORD_ENABLE, + YCM_REG_DBG_SHIFT, YCM_REG_DBG_FORCE_VALID, + YCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 5 +}; + +static struct block_defs block_pcm_defs = { + "pcm", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PCM_REG_DBG_SELECT, PCM_REG_DBG_DWORD_ENABLE, + PCM_REG_DBG_SHIFT, PCM_REG_DBG_FORCE_VALID, + PCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 4 +}; + +static struct block_defs block_qm_defs = { + "qm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCQ}, + QM_REG_DBG_SELECT, QM_REG_DBG_DWORD_ENABLE, + QM_REG_DBG_SHIFT, QM_REG_DBG_FORCE_VALID, + QM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 16 +}; + +static struct block_defs block_tm_defs = { + "tm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TM_REG_DBG_SELECT, TM_REG_DBG_DWORD_ENABLE, + TM_REG_DBG_SHIFT, TM_REG_DBG_FORCE_VALID, + TM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 17 +}; + +static struct block_defs block_dorq_defs = { + "dorq", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + DORQ_REG_DBG_SELECT, DORQ_REG_DBG_DWORD_ENABLE, + DORQ_REG_DBG_SHIFT, DORQ_REG_DBG_FORCE_VALID, + DORQ_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 18 +}; + +static struct block_defs block_brb_defs = { + "brb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + BRB_REG_DBG_SELECT, BRB_REG_DBG_DWORD_ENABLE, + BRB_REG_DBG_SHIFT, BRB_REG_DBG_FORCE_VALID, + BRB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 0 +}; + +static struct block_defs block_src_defs = { + "src", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + SRC_REG_DBG_SELECT, SRC_REG_DBG_DWORD_ENABLE, + SRC_REG_DBG_SHIFT, SRC_REG_DBG_FORCE_VALID, + SRC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 2 +}; + +static struct block_defs block_prs_defs = { + "prs", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + PRS_REG_DBG_SELECT, PRS_REG_DBG_DWORD_ENABLE, + PRS_REG_DBG_SHIFT, PRS_REG_DBG_FORCE_VALID, + PRS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 1 +}; + +static struct block_defs block_tsdm_defs = { + "tsdm", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSDM_REG_DBG_SELECT, TSDM_REG_DBG_DWORD_ENABLE, + TSDM_REG_DBG_SHIFT, TSDM_REG_DBG_FORCE_VALID, + TSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 3 +}; + +static struct block_defs block_msdm_defs = { + "msdm", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MSDM_REG_DBG_SELECT, MSDM_REG_DBG_DWORD_ENABLE, + MSDM_REG_DBG_SHIFT, MSDM_REG_DBG_FORCE_VALID, + MSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 6 +}; + +static struct block_defs block_usdm_defs = { + "usdm", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + USDM_REG_DBG_SELECT, USDM_REG_DBG_DWORD_ENABLE, + USDM_REG_DBG_SHIFT, USDM_REG_DBG_FORCE_VALID, + USDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 7 +}; + +static struct block_defs block_xsdm_defs = { + "xsdm", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSDM_REG_DBG_SELECT, XSDM_REG_DBG_DWORD_ENABLE, + XSDM_REG_DBG_SHIFT, XSDM_REG_DBG_FORCE_VALID, + XSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 20 +}; + +static struct block_defs block_ysdm_defs = { + "ysdm", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YSDM_REG_DBG_SELECT, YSDM_REG_DBG_DWORD_ENABLE, + YSDM_REG_DBG_SHIFT, YSDM_REG_DBG_FORCE_VALID, + YSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 8 +}; + +static struct block_defs block_psdm_defs = { + "psdm", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSDM_REG_DBG_SELECT, PSDM_REG_DBG_DWORD_ENABLE, + PSDM_REG_DBG_SHIFT, PSDM_REG_DBG_FORCE_VALID, + PSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 7 +}; + +static struct block_defs block_tsem_defs = { + "tsem", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSEM_REG_DBG_SELECT, TSEM_REG_DBG_DWORD_ENABLE, + TSEM_REG_DBG_SHIFT, TSEM_REG_DBG_FORCE_VALID, + TSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 4 +}; + +static struct block_defs block_msem_defs = { + "msem", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MSEM_REG_DBG_SELECT, MSEM_REG_DBG_DWORD_ENABLE, + MSEM_REG_DBG_SHIFT, MSEM_REG_DBG_FORCE_VALID, + MSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 9 +}; + +static struct block_defs block_usem_defs = { + "usem", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + USEM_REG_DBG_SELECT, USEM_REG_DBG_DWORD_ENABLE, + USEM_REG_DBG_SHIFT, USEM_REG_DBG_FORCE_VALID, + USEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 9 +}; + +static struct block_defs block_xsem_defs = { + "xsem", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSEM_REG_DBG_SELECT, XSEM_REG_DBG_DWORD_ENABLE, + XSEM_REG_DBG_SHIFT, XSEM_REG_DBG_FORCE_VALID, + XSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 21 +}; + +static struct block_defs block_ysem_defs = { + "ysem", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YSEM_REG_DBG_SELECT, YSEM_REG_DBG_DWORD_ENABLE, + YSEM_REG_DBG_SHIFT, YSEM_REG_DBG_FORCE_VALID, + YSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 11 +}; + +static struct block_defs block_psem_defs = { + "psem", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSEM_REG_DBG_SELECT, PSEM_REG_DBG_DWORD_ENABLE, + PSEM_REG_DBG_SHIFT, PSEM_REG_DBG_FORCE_VALID, + PSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 10 +}; + +static struct block_defs block_rss_defs = { + "rss", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + RSS_REG_DBG_SELECT, RSS_REG_DBG_DWORD_ENABLE, + RSS_REG_DBG_SHIFT, RSS_REG_DBG_FORCE_VALID, + RSS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 18 +}; + +static struct block_defs block_tmld_defs = { + "tmld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + TMLD_REG_DBG_SELECT, TMLD_REG_DBG_DWORD_ENABLE, + TMLD_REG_DBG_SHIFT, TMLD_REG_DBG_FORCE_VALID, + TMLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 13 +}; + +static struct block_defs block_muld_defs = { + "muld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + MULD_REG_DBG_SELECT, MULD_REG_DBG_DWORD_ENABLE, + MULD_REG_DBG_SHIFT, MULD_REG_DBG_FORCE_VALID, + MULD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 14 +}; + +static struct block_defs block_yuld_defs = { + "yuld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + YULD_REG_DBG_SELECT, YULD_REG_DBG_DWORD_ENABLE, + YULD_REG_DBG_SHIFT, YULD_REG_DBG_FORCE_VALID, + YULD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 15 +}; + +static struct block_defs block_xyld_defs = { + "xyld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XYLD_REG_DBG_SELECT, XYLD_REG_DBG_DWORD_ENABLE, + XYLD_REG_DBG_SHIFT, XYLD_REG_DBG_FORCE_VALID, + XYLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 12 +}; + +static struct block_defs block_prm_defs = { + "prm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + PRM_REG_DBG_SELECT, PRM_REG_DBG_DWORD_ENABLE, + PRM_REG_DBG_SHIFT, PRM_REG_DBG_FORCE_VALID, + PRM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 21 +}; + +static struct block_defs block_pbf_pb1_defs = { + "pbf_pb1", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_PB1_REG_DBG_SELECT, PBF_PB1_REG_DBG_DWORD_ENABLE, + PBF_PB1_REG_DBG_SHIFT, PBF_PB1_REG_DBG_FORCE_VALID, + PBF_PB1_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 11 +}; + +static struct block_defs block_pbf_pb2_defs = { + "pbf_pb2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_PB2_REG_DBG_SELECT, PBF_PB2_REG_DBG_DWORD_ENABLE, + PBF_PB2_REG_DBG_SHIFT, PBF_PB2_REG_DBG_FORCE_VALID, + PBF_PB2_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 12 +}; + +static struct block_defs block_rpb_defs = { + "rpb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + RPB_REG_DBG_SELECT, RPB_REG_DBG_DWORD_ENABLE, + RPB_REG_DBG_SHIFT, RPB_REG_DBG_FORCE_VALID, + RPB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 13 +}; + +static struct block_defs block_btb_defs = { + "btb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCV}, + BTB_REG_DBG_SELECT, BTB_REG_DBG_DWORD_ENABLE, + BTB_REG_DBG_SHIFT, BTB_REG_DBG_FORCE_VALID, + BTB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 10 +}; + +static struct block_defs block_pbf_defs = { + "pbf", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_REG_DBG_SELECT, PBF_REG_DBG_DWORD_ENABLE, + PBF_REG_DBG_SHIFT, PBF_REG_DBG_FORCE_VALID, + PBF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 15 +}; + +static struct block_defs block_rdif_defs = { + "rdif", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + RDIF_REG_DBG_SELECT, RDIF_REG_DBG_DWORD_ENABLE, + RDIF_REG_DBG_SHIFT, RDIF_REG_DBG_FORCE_VALID, + RDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 16 +}; + +static struct block_defs block_tdif_defs = { + "tdif", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TDIF_REG_DBG_SELECT, TDIF_REG_DBG_DWORD_ENABLE, + TDIF_REG_DBG_SHIFT, TDIF_REG_DBG_FORCE_VALID, + TDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 17 +}; + +static struct block_defs block_cdu_defs = { + "cdu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CDU_REG_DBG_SELECT, CDU_REG_DBG_DWORD_ENABLE, + CDU_REG_DBG_SHIFT, CDU_REG_DBG_FORCE_VALID, + CDU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 23 +}; + +static struct block_defs block_ccfc_defs = { + "ccfc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CCFC_REG_DBG_SELECT, CCFC_REG_DBG_DWORD_ENABLE, + CCFC_REG_DBG_SHIFT, CCFC_REG_DBG_FORCE_VALID, + CCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 24 +}; + +static struct block_defs block_tcfc_defs = { + "tcfc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + TCFC_REG_DBG_SELECT, TCFC_REG_DBG_DWORD_ENABLE, + TCFC_REG_DBG_SHIFT, TCFC_REG_DBG_FORCE_VALID, + TCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 25 +}; + +static struct block_defs block_igu_defs = { + "igu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + IGU_REG_DBG_SELECT, IGU_REG_DBG_DWORD_ENABLE, + IGU_REG_DBG_SHIFT, IGU_REG_DBG_FORCE_VALID, + IGU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 27 +}; + +static struct block_defs block_cau_defs = { + "cau", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + CAU_REG_DBG_SELECT, CAU_REG_DBG_DWORD_ENABLE, + CAU_REG_DBG_SHIFT, CAU_REG_DBG_FORCE_VALID, + CAU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 19 +}; + +static struct block_defs block_umac_defs = { + "umac", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + UMAC_REG_DBG_SELECT, UMAC_REG_DBG_DWORD_ENABLE, + UMAC_REG_DBG_SHIFT, UMAC_REG_DBG_FORCE_VALID, + UMAC_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 6 +}; + +static struct block_defs block_xmac_defs = { + "xmac", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbg_defs = { + "dbg", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 3 +}; + +static struct block_defs block_nig_defs = { + "nig", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + NIG_REG_DBG_SELECT, NIG_REG_DBG_DWORD_ENABLE, + NIG_REG_DBG_SHIFT, NIG_REG_DBG_FORCE_VALID, + NIG_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 0 +}; + +static struct block_defs block_wol_defs = { + "wol", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + WOL_REG_DBG_SELECT, WOL_REG_DBG_DWORD_ENABLE, + WOL_REG_DBG_SHIFT, WOL_REG_DBG_FORCE_VALID, + WOL_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 7 +}; + +static struct block_defs block_bmbn_defs = { + "bmbn", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCB}, + BMBN_REG_DBG_SELECT, BMBN_REG_DBG_DWORD_ENABLE, + BMBN_REG_DBG_SHIFT, BMBN_REG_DBG_FORCE_VALID, + BMBN_REG_DBG_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_ipc_defs = { + "ipc", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_UA, 8 +}; + +static struct block_defs block_nwm_defs = { + "nwm", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + NWM_REG_DBG_SELECT, NWM_REG_DBG_DWORD_ENABLE, + NWM_REG_DBG_SHIFT, NWM_REG_DBG_FORCE_VALID, + NWM_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV_2, 0 +}; + +static struct block_defs block_nws_defs = { + "nws", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 12 +}; + +static struct block_defs block_ms_defs = { + "ms", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 13 +}; + +static struct block_defs block_phy_pcie_defs = { + "phy_pcie", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, + PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, + PCIE_REG_DBG_COMMON_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_led_defs = { + "led", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISCS_PL_HV, 14 +}; + +static struct block_defs block_misc_aeu_defs = { + "misc_aeu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_bar0_map_defs = { + "bar0_map", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs *s_block_defs[MAX_BLOCK_ID] = { + &block_grc_defs, + &block_miscs_defs, + &block_misc_defs, + &block_dbu_defs, + &block_pglue_b_defs, + &block_cnig_defs, + &block_cpmu_defs, + &block_ncsi_defs, + &block_opte_defs, + &block_bmb_defs, + &block_pcie_defs, + &block_mcp_defs, + &block_mcp2_defs, + &block_pswhst_defs, + &block_pswhst2_defs, + &block_pswrd_defs, + &block_pswrd2_defs, + &block_pswwr_defs, + &block_pswwr2_defs, + &block_pswrq_defs, + &block_pswrq2_defs, + &block_pglcs_defs, + &block_dmae_defs, + &block_ptu_defs, + &block_tcm_defs, + &block_mcm_defs, + &block_ucm_defs, + &block_xcm_defs, + &block_ycm_defs, + &block_pcm_defs, + &block_qm_defs, + &block_tm_defs, + &block_dorq_defs, + &block_brb_defs, + &block_src_defs, + &block_prs_defs, + &block_tsdm_defs, + &block_msdm_defs, + &block_usdm_defs, + &block_xsdm_defs, + &block_ysdm_defs, + &block_psdm_defs, + &block_tsem_defs, + &block_msem_defs, + &block_usem_defs, + &block_xsem_defs, + &block_ysem_defs, + &block_psem_defs, + &block_rss_defs, + &block_tmld_defs, + &block_muld_defs, + &block_yuld_defs, + &block_xyld_defs, + &block_prm_defs, + &block_pbf_pb1_defs, + &block_pbf_pb2_defs, + &block_rpb_defs, + &block_btb_defs, + &block_pbf_defs, + &block_rdif_defs, + &block_tdif_defs, + &block_cdu_defs, + &block_ccfc_defs, + &block_tcfc_defs, + &block_igu_defs, + &block_cau_defs, + &block_umac_defs, + &block_xmac_defs, + &block_dbg_defs, + &block_nig_defs, + &block_wol_defs, + &block_bmbn_defs, + &block_ipc_defs, + &block_nwm_defs, + &block_nws_defs, + &block_ms_defs, + &block_phy_pcie_defs, + &block_led_defs, + &block_misc_aeu_defs, + &block_bar0_map_defs, +}; + +static struct platform_defs s_platform_defs[] = { + {"asic", 1}, + {"reserved", 0}, + {"reserved2", 0}, + {"reserved3", 0} +}; + +static struct grc_param_defs s_grc_param_defs[] = { + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_TSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_MSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_USTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_XSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_YSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_PSTORM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_REGS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RAM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PBUF */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IOR */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_VFC */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM_CTX */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_ILT */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RSS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CAU */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_QM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MCP */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_RESERVED */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CFC */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IGU */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BRB */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BTB */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BMB */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_NIG */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MULD */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PRS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DMAE */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_TM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_SDM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DIF */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_STATIC */ + {{0, 0, 0}, 0, 1, false, 0, 0}, /* DBG_GRC_PARAM_UNSTALL */ + {{MAX_LCIDS, MAX_LCIDS, MAX_LCIDS}, 1, MAX_LCIDS, false, MAX_LCIDS, + MAX_LCIDS}, /* DBG_GRC_PARAM_NUM_LCIDS */ + {{MAX_LTIDS, MAX_LTIDS, MAX_LTIDS}, 1, MAX_LTIDS, false, MAX_LTIDS, + MAX_LTIDS}, /* DBG_GRC_PARAM_NUM_LTIDS */ + {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_EXCLUDE_ALL */ + {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_CRASH */ + {{0, 0, 0}, 0, 1, false, 1, 0}, /* DBG_GRC_PARAM_PARITY_SAFE */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM */ + {{1, 1, 1}, 0, 1, false, 0, 1} /* DBG_GRC_PARAM_DUMP_PHY */ +}; + +static struct rss_mem_defs s_rss_mem_defs[] = { + { "rss_mem_cid", "rss_cid", 0, + {256, 256, 320}, + {32, 32, 32} }, + { "rss_mem_key_msb", "rss_key", 1024, + {128, 128, 208}, + {256, 256, 256} }, + { "rss_mem_key_lsb", "rss_key", 2048, + {128, 128, 208}, + {64, 64, 64} }, + { "rss_mem_info", "rss_info", 3072, + {128, 128, 208}, + {16, 16, 16} }, + { "rss_mem_ind", "rss_ind", 4096, + {(128 * 128), (128 * 128), (128 * 208)}, + {16, 16, 16} } +}; + +static struct vfc_ram_defs s_vfc_ram_defs[] = { + {"vfc_ram_tt1", "vfc_ram", 0, 512}, + {"vfc_ram_mtt2", "vfc_ram", 512, 128}, + {"vfc_ram_stt2", "vfc_ram", 640, 32}, + {"vfc_ram_ro_vect", "vfc_ram", 672, 32} +}; + +static struct big_ram_defs s_big_ram_defs[] = { + { "BRB", MEM_GROUP_BRB_MEM, MEM_GROUP_BRB_RAM, DBG_GRC_PARAM_DUMP_BRB, + BRB_REG_BIG_RAM_ADDRESS, BRB_REG_BIG_RAM_DATA, + {4800, 4800, 5632} }, + { "BTB", MEM_GROUP_BTB_MEM, MEM_GROUP_BTB_RAM, DBG_GRC_PARAM_DUMP_BTB, + BTB_REG_BIG_RAM_ADDRESS, BTB_REG_BIG_RAM_DATA, + {2880, 2880, 3680} }, + { "BMB", MEM_GROUP_BMB_MEM, MEM_GROUP_BMB_RAM, DBG_GRC_PARAM_DUMP_BMB, + BMB_REG_BIG_RAM_ADDRESS, BMB_REG_BIG_RAM_DATA, + {1152, 1152, 1152} } +}; + +static struct reset_reg_defs s_reset_regs_defs[] = { + { MISCS_REG_RESET_PL_UA, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_UA */ + { MISCS_REG_RESET_PL_HV, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_HV */ + { MISCS_REG_RESET_PL_HV_2, 0x0, + {false, false, true} }, /* DBG_RESET_REG_MISCS_PL_HV_2 */ + { MISC_REG_RESET_PL_UA, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_UA */ + { MISC_REG_RESET_PL_HV, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_HV */ + { MISC_REG_RESET_PL_PDA_VMAIN_1, 0x4404040, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_1 */ + { MISC_REG_RESET_PL_PDA_VMAIN_2, 0x7c00007, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_2 */ + { MISC_REG_RESET_PL_PDA_VAUX, 0x2, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VAUX */ +}; + +static struct phy_defs s_phy_defs[] = { + {"nw_phy", NWS_REG_NWS_CMU, PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0, + PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8}, + {"sgmii_phy", MS_REG_MS_CMU, PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131}, + {"pcie_phy0", PHY_PCIE_REG_PHY0, PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131}, + {"pcie_phy1", PHY_PCIE_REG_PHY1, PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131}, +}; + +/**************************** Private Functions ******************************/ + +/* Reads and returns a single dword from the specified unaligned buffer */ +static u32 qed_read_unaligned_dword(u8 *buf) +{ + u32 dword; + + memcpy((u8 *)&dword, buf, sizeof(dword)); + return dword; +} + +/* Initializes debug data for the specified device */ +static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (dev_data->initialized) + return DBG_STATUS_OK; + + if (QED_IS_K2(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_K2; + dev_data->mode_enable[MODE_K2] = 1; + } else if (QED_IS_BB_B0(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_BB_B0; + dev_data->mode_enable[MODE_BB_B0] = 1; + } else { + return DBG_STATUS_UNKNOWN_CHIP; + } + + dev_data->platform_id = PLATFORM_ASIC; + dev_data->mode_enable[MODE_ASIC] = 1; + dev_data->initialized = true; + return DBG_STATUS_OK; +} + +/* Reads the FW info structure for the specified Storm from the chip, + * and writes it to the specified fw_info pointer. + */ +static void qed_read_fw_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 storm_id, struct fw_info *fw_info) +{ + /* Read first the address that points to fw_info location. + * The address is located in the last line of the Storm RAM. + */ + u32 addr = s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_INT_RAM + + DWORDS_TO_BYTES(SEM_FAST_REG_INT_RAM_SIZE) - + sizeof(struct fw_info_location); + struct fw_info_location fw_info_location; + u32 *dest = (u32 *)&fw_info_location; + u32 i; + + memset(&fw_info_location, 0, sizeof(fw_info_location)); + memset(fw_info, 0, sizeof(*fw_info)); + for (i = 0; i < BYTES_TO_DWORDS(sizeof(fw_info_location)); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + if (fw_info_location.size > 0 && fw_info_location.size <= + sizeof(*fw_info)) { + /* Read FW version info from Storm RAM */ + addr = fw_info_location.grc_addr; + dest = (u32 *)fw_info; + for (i = 0; i < BYTES_TO_DWORDS(fw_info_location.size); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + } +} + +/* Dumps the specified string to the specified buffer. Returns the dumped size + * in bytes (actual length + 1 for the null character termination). + */ +static u32 qed_dump_str(char *dump_buf, bool dump, const char *str) +{ + if (dump) + strcpy(dump_buf, str); + return (u32)strlen(str) + 1; +} + +/* Dumps zeros to align the specified buffer to dwords. Returns the dumped size + * in bytes. + */ +static u32 qed_dump_align(char *dump_buf, bool dump, u32 byte_offset) +{ + u8 offset_in_dword = (u8)(byte_offset & 0x3), align_size; + + align_size = offset_in_dword ? BYTES_IN_DWORD - offset_in_dword : 0; + + if (dump && align_size) + memset(dump_buf, 0, align_size); + return align_size; +} + +/* Writes the specified string param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_str_param(u32 *dump_buf, + bool dump, + const char *param_name, const char *param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a string param value */ + if (dump) + *(char_buf + offset) = 1; + offset++; + + /* Dump param value */ + offset += qed_dump_str(char_buf + offset, dump, param_val); + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + return BYTES_TO_DWORDS(offset); +} + +/* Writes the specified numeric param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_num_param(u32 *dump_buf, + bool dump, const char *param_name, u32 param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a numeric param value */ + if (dump) + *(char_buf + offset) = 0; + offset++; + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + + /* Dump param value (and change offset from bytes to dwords) */ + offset = BYTES_TO_DWORDS(offset); + if (dump) + *(dump_buf + offset) = param_val; + offset++; + return offset; +} + +/* Reads the FW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_fw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char fw_ver_str[16] = EMPTY_FW_VERSION_STR; + char fw_img_str[16] = EMPTY_FW_IMAGE_STR; + struct fw_info fw_info = { {0}, {0} }; + int printed_chars; + u32 offset = 0; + + if (dump) { + /* Read FW image/version from PRAM in a non-reset SEMI */ + bool found = false; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS && !found; + storm_id++) { + /* Read FW version/image */ + if (!dev_data->block_in_reset + [s_storm_defs[storm_id].block_id]) { + /* read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, + p_ptt, storm_id, &fw_info); + + /* Create FW version/image strings */ + printed_chars = + snprintf(fw_ver_str, + sizeof(fw_ver_str), + "%d_%d_%d_%d", + fw_info.ver.num.major, + fw_info.ver.num.minor, + fw_info.ver.num.rev, + fw_info.ver.num.eng); + if (printed_chars < 0 || printed_chars >= + sizeof(fw_ver_str)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid FW version string\n"); + switch (fw_info.ver.image_id) { + case FW_IMG_MAIN: + strcpy(fw_img_str, "main"); + break; + default: + strcpy(fw_img_str, "unknown"); + break; + } + + found = true; + } + } + } + + /* Dump FW version, image and timestamp */ + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-version", fw_ver_str); + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-image", fw_img_str); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "fw-timestamp", fw_info.ver.timestamp); + return offset; +} + +/* Reads the MFW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_mfw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + char mfw_ver_str[16] = EMPTY_FW_VERSION_STR; + + if (dump) { + u32 global_section_offsize, global_section_addr, mfw_ver; + u32 public_data_addr, global_section_offsize_addr; + int printed_chars; + + /* Find MCP public data GRC address. + * Needs to be ORed with MCP_REG_SCRATCH due to a HW bug. + */ + public_data_addr = qed_rd(p_hwfn, p_ptt, + MISC_REG_SHARED_MEM_ADDR) | + MCP_REG_SCRATCH; + + /* Find MCP public global section offset */ + global_section_offsize_addr = public_data_addr + + offsetof(struct mcp_public_data, + sections) + + sizeof(offsize_t) * PUBLIC_GLOBAL; + global_section_offsize = qed_rd(p_hwfn, p_ptt, + global_section_offsize_addr); + global_section_addr = MCP_REG_SCRATCH + + (global_section_offsize & + OFFSIZE_OFFSET_MASK) * 4; + + /* Read MFW version from MCP public global section */ + mfw_ver = qed_rd(p_hwfn, p_ptt, + global_section_addr + + offsetof(struct public_global, mfw_ver)); + + /* Dump MFW version param */ + printed_chars = snprintf(mfw_ver_str, sizeof(mfw_ver_str), + "%d_%d_%d_%d", + (u8) (mfw_ver >> 24), + (u8) (mfw_ver >> 16), + (u8) (mfw_ver >> 8), + (u8) mfw_ver); + if (printed_chars < 0 || printed_chars >= sizeof(mfw_ver_str)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid MFW version string\n"); + } + + return qed_dump_str_param(dump_buf, dump, "mfw-version", mfw_ver_str); +} + +/* Writes a section header to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_section_hdr(u32 *dump_buf, + bool dump, const char *name, u32 num_params) +{ + return qed_dump_num_param(dump_buf, dump, name, num_params); +} + +/* Writes the common global params to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_common_global_params(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 num_specific_global_params) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + + /* Find platform string and dump global params section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, + "global_params", + NUM_COMMON_GLOBAL_PARAMS + + num_specific_global_params); + + /* Store params */ + offset += qed_dump_fw_ver_param(p_hwfn, p_ptt, dump_buf + offset, dump); + offset += qed_dump_mfw_ver_param(p_hwfn, + p_ptt, dump_buf + offset, dump); + offset += qed_dump_num_param(dump_buf + offset, + dump, "tools-version", TOOLS_VERSION); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "chip", + s_chip_defs[dev_data->chip_id].name); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "platform", + s_platform_defs[dev_data->platform_id]. + name); + offset += + qed_dump_num_param(dump_buf + offset, dump, "pci-func", + p_hwfn->abs_pf_id); + return offset; +} + +/* Writes the last section to the specified buffer at the given offset. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_last_section(u32 *dump_buf, u32 offset, bool dump) +{ + u32 start_offset = offset, crc = ~0; + + /* Dump CRC section header */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "last", 0); + + /* Calculate CRC32 and add it to the dword following the "last" section. + */ + if (dump) + *(dump_buf + offset) = ~crc32(crc, (u8 *)dump_buf, + DWORDS_TO_BYTES(offset)); + offset++; + return offset - start_offset; +} + +/* Update blocks reset state */ +static void qed_update_blocks_reset_state(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 i; + + /* Read reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) + reg_val[i] = qed_rd(p_hwfn, + p_ptt, s_reset_regs_defs[i].addr); + + /* Check if blocks are in reset */ + for (i = 0; i < MAX_BLOCK_ID; i++) + dev_data->block_in_reset[i] = + s_block_defs[i]->has_reset_bit && + !(reg_val[s_block_defs[i]->reset_reg] & + BIT(s_block_defs[i]->reset_bit_offset)); +} + +/* Enable / disable the Debug block */ +static void qed_bus_enable_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool enable) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_DBG_BLOCK_ON, enable ? 1 : 0); +} + +/* Resets the Debug block */ +static void qed_bus_reset_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 dbg_reset_reg_addr, old_reset_reg_val, new_reset_reg_val; + + dbg_reset_reg_addr = + s_reset_regs_defs[s_block_defs[BLOCK_DBG]->reset_reg].addr; + old_reset_reg_val = qed_rd(p_hwfn, p_ptt, dbg_reset_reg_addr); + new_reset_reg_val = old_reset_reg_val & + ~BIT(s_block_defs[BLOCK_DBG]->reset_bit_offset); + + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, new_reset_reg_val); + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, old_reset_reg_val); +} + +static void qed_bus_set_framing_mode(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum dbg_bus_frame_modes mode) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_FRAMING_MODE, (u8)mode); +} + +/* Enable / disable Debug Bus clients according to the specified mask. + * (1 = enable, 0 = disable) + */ +static void qed_bus_enable_clients(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 client_mask) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_CLIENT_ENABLE, client_mask); +} + +static bool qed_is_mode_match(struct qed_hwfn *p_hwfn, u16 *modes_buf_offset) +{ + const u32 *ptr = s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 tree_val = ((u8 *)ptr)[(*modes_buf_offset)++]; + bool arg1, arg2; + + switch (tree_val) { + case INIT_MODE_OP_NOT: + return !qed_is_mode_match(p_hwfn, modes_buf_offset); + case INIT_MODE_OP_OR: + case INIT_MODE_OP_AND: + arg1 = qed_is_mode_match(p_hwfn, modes_buf_offset); + arg2 = qed_is_mode_match(p_hwfn, modes_buf_offset); + return (tree_val == INIT_MODE_OP_OR) ? (arg1 || + arg2) : (arg1 && arg2); + default: + return dev_data->mode_enable[tree_val - MAX_INIT_MODE_OPS] > 0; + } +} + +/* Returns the value of the specified GRC param */ +static u32 qed_grc_get_param(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + return dev_data->grc.param_val[grc_param]; +} + +/* Clear all GRC params */ +static void qed_dbg_grc_clear_params(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + dev_data->grc.param_set_by_user[i] = 0; +} + +/* Assign default GRC param values */ +static void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + if (!dev_data->grc.param_set_by_user[i]) + dev_data->grc.param_val[i] = + s_grc_param_defs[i].default_val[dev_data->chip_id]; +} + +/* Returns true if the specified entity (indicated by GRC param) should be + * included in the dump, false otherwise. + */ +static bool qed_grc_is_included(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + return qed_grc_get_param(p_hwfn, grc_param) > 0; +} + +/* Returns true of the specified Storm should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_storm_included(struct qed_hwfn *p_hwfn, + enum dbg_storms storm) +{ + return qed_grc_get_param(p_hwfn, (enum dbg_grc_params)storm) > 0; +} + +/* Returns true if the specified memory should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_mem_included(struct qed_hwfn *p_hwfn, + enum block_id block_id, u8 mem_group_id) +{ + u8 i; + + /* Check Storm match */ + if (s_block_defs[block_id]->associated_to_storm && + !qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)s_block_defs[block_id]->storm_id)) + return false; + + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) + if (mem_group_id == s_big_ram_defs[i].mem_group_id || + mem_group_id == s_big_ram_defs[i].ram_mem_group_id) + return qed_grc_is_included(p_hwfn, + s_big_ram_defs[i].grc_param); + if (mem_group_id == MEM_GROUP_PXP_ILT || mem_group_id == + MEM_GROUP_PXP_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PXP); + if (mem_group_id == MEM_GROUP_RAM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RAM); + if (mem_group_id == MEM_GROUP_PBUF) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PBUF); + if (mem_group_id == MEM_GROUP_CAU_MEM || + mem_group_id == MEM_GROUP_CAU_SB || + mem_group_id == MEM_GROUP_CAU_PI) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CAU); + if (mem_group_id == MEM_GROUP_QM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_QM); + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM || + mem_group_id == MEM_GROUP_TASK_CFC_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CFC); + if (mem_group_id == MEM_GROUP_IGU_MEM || mem_group_id == + MEM_GROUP_IGU_MSIX) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IGU); + if (mem_group_id == MEM_GROUP_MULD_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MULD); + if (mem_group_id == MEM_GROUP_PRS_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PRS); + if (mem_group_id == MEM_GROUP_DMAE_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_DMAE); + if (mem_group_id == MEM_GROUP_TM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_TM); + if (mem_group_id == MEM_GROUP_SDM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_SDM); + if (mem_group_id == MEM_GROUP_TDIF_CTX || mem_group_id == + MEM_GROUP_RDIF_CTX) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_DIF); + if (mem_group_id == MEM_GROUP_CM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM); + if (mem_group_id == MEM_GROUP_IOR) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR); + + return true; +} + +/* Stalls all Storms */ +static void qed_grc_stall_storms(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool stall) +{ + u8 reg_val = stall ? 1 : 0; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) { + u32 reg_addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STALL_0; + + qed_wr(p_hwfn, p_ptt, reg_addr, reg_val); + } + } + + msleep(STALL_DELAY_MS); +} + +/* Takes all blocks out of reset */ +static void qed_grc_unreset_blocks(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 i; + + /* Fill reset regs values */ + for (i = 0; i < MAX_BLOCK_ID; i++) + if (s_block_defs[i]->has_reset_bit && s_block_defs[i]->unreset) + reg_val[s_block_defs[i]->reset_reg] |= + BIT(s_block_defs[i]->reset_bit_offset); + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) { + reg_val[i] |= s_reset_regs_defs[i].unreset_val; + if (reg_val[i]) + qed_wr(p_hwfn, + p_ptt, + s_reset_regs_defs[i].addr + + RESET_REG_UNRESET_OFFSET, reg_val[i]); + } + } +} + +/* Returns the attention name offsets of the specified block */ +static const struct dbg_attn_block_type_data * +qed_get_block_attn_data(enum block_id block_id, enum dbg_attn_type attn_type) +{ + const struct dbg_attn_block *base_attn_block_arr = + (const struct dbg_attn_block *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr; + + return &base_attn_block_arr[block_id].per_type_data[attn_type]; +} + +/* Returns the attention registers of the specified block */ +static const struct dbg_attn_reg * +qed_get_block_attn_regs(enum block_id block_id, enum dbg_attn_type attn_type, + u8 *num_attn_regs) +{ + const struct dbg_attn_block_type_data *block_type_data = + qed_get_block_attn_data(block_id, attn_type); + + *num_attn_regs = block_type_data->num_regs; + return &((const struct dbg_attn_reg *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr)[block_type_data-> + regs_offset]; +} + +/* For each block, clear the status of all parities */ +static void qed_grc_clear_all_prty(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 reg_idx, num_attn_regs; + u32 block_id; + + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + const struct dbg_attn_reg *attn_reg_arr; + + if (dev_data->block_in_reset[block_id]) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + + /* Check mode */ + bool eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + u16 modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + if (!eval_mode || + qed_is_mode_match(p_hwfn, &modes_buf_offset)) + /* Mode match - read parity status read-clear + * register. + */ + qed_rd(p_hwfn, p_ptt, + DWORDS_TO_BYTES(reg_data-> + sts_clr_address)); + } + } +} + +/* Dumps GRC registers section header. Returns the dumped size in dwords. + * The following parameters are dumped: + * - 'count' = num_dumped_entries + * - 'split' = split_type + * - 'id'i = split_id (dumped only if split_id >= 0) + * - 'param_name' = param_val (user param, dumped only if param_name != NULL and + * param_val != NULL) + */ +static u32 qed_grc_dump_regs_hdr(u32 *dump_buf, + bool dump, + u32 num_reg_entries, + const char *split_type, + int split_id, + const char *param_name, const char *param_val) +{ + u8 num_params = 2 + (split_id >= 0 ? 1 : 0) + (param_name ? 1 : 0); + u32 offset = 0; + + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_regs", num_params); + offset += qed_dump_num_param(dump_buf + offset, + dump, "count", num_reg_entries); + offset += qed_dump_str_param(dump_buf + offset, + dump, "split", split_type); + if (split_id >= 0) + offset += qed_dump_num_param(dump_buf + offset, + dump, "id", split_id); + if (param_name && param_val) + offset += qed_dump_str_param(dump_buf + offset, + dump, param_name, param_val); + return offset; +} + +/* Dumps GRC register/memory. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + bool dump, u32 addr, u32 len) +{ + u32 offset = 0, i; + + if (dump) { + *(dump_buf + offset++) = addr | (len << REG_DUMP_LEN_SHIFT); + for (i = 0; i < len; i++, addr++, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, + p_ptt, + DWORDS_TO_BYTES(addr)); + } else { + offset += len + 1; + } + + return offset; +} + +/* Dumps GRC registers entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + u32 *num_dumped_reg_entries) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + *num_dumped_reg_entries = 0; + while (input_offset < input_regs_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr = + (const struct dbg_dump_cond_hdr *) + &input_regs_arr.ptr[input_offset++]; + bool eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + + /* Check mode/block */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match && block_enable[cond_hdr->block_id]) { + for (i = 0; i < cond_hdr->data_size; + i++, input_offset++) { + const struct dbg_dump_reg *reg = + (const struct dbg_dump_reg *) + &input_regs_arr.ptr[input_offset]; + + offset += + qed_grc_dump_reg_entry(p_hwfn, p_ptt, + dump_buf + offset, dump, + GET_FIELD(reg->data, + DBG_DUMP_REG_ADDRESS), + GET_FIELD(reg->data, + DBG_DUMP_REG_LENGTH)); + (*num_dumped_reg_entries)++; + } + } else { + input_offset += cond_hdr->data_size; + } + } + + return offset; +} + +/* Dumps GRC registers entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *split_type_name, + u32 split_id, + const char *param_name, + const char *param_val) +{ + u32 num_dumped_reg_entries, offset; + + /* Calculate register dump header size (and skip it for now) */ + offset = qed_grc_dump_regs_hdr(dump_buf, + false, + 0, + split_type_name, + split_id, param_name, param_val); + + /* Dump registers */ + offset += qed_grc_dump_regs_entries(p_hwfn, + p_ptt, + input_regs_arr, + dump_buf + offset, + dump, + block_enable, + &num_dumped_reg_entries); + + /* Write register dump header */ + if (dump && num_dumped_reg_entries > 0) + qed_grc_dump_regs_hdr(dump_buf, + dump, + num_dumped_reg_entries, + split_type_name, + split_id, param_name, param_val); + + return num_dumped_reg_entries > 0 ? offset : 0; +} + +/* Dumps registers according to the input registers array. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *param_name, const char *param_val) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, input_offset = 0; + u8 port_id, pf_id; + + if (dump) + DP_VERBOSE(p_hwfn, QED_MSG_DEBUG, "Dumping registers...\n"); + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr = + (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset++]; + u8 split_type_id = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + u32 split_data_size = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + struct dbg_array curr_input_regs_arr = { + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset], + split_data_size}; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + case SPLIT_TYPE_VF: + offset += qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, + block_enable, + "eng", + (u32)(-1), + param_name, + param_val); + break; + case SPLIT_TYPE_PORT: + for (port_id = 0; + port_id < + s_chip_defs[dev_data->chip_id]. + per_platform[dev_data->platform_id].num_ports; + port_id++) { + if (dump) + qed_port_pretend(p_hwfn, p_ptt, + port_id); + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "port", port_id, + param_name, + param_val); + } + break; + case SPLIT_TYPE_PF: + case SPLIT_TYPE_PORT_PF: + for (pf_id = 0; + pf_id < + s_chip_defs[dev_data->chip_id]. + per_platform[dev_data->platform_id].num_pfs; + pf_id++) { + if (dump) + qed_fid_pretend(p_hwfn, p_ptt, pf_id); + offset += qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "pf", pf_id, param_name, + param_val); + } + break; + default: + break; + } + + input_offset += split_data_size; + } + + /* Pretend to original PF */ + if (dump) + qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + return offset; +} + +/* Dump reset registers. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i, offset = 0, num_regs = 0; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) { + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS + (s_reset_regs_defs + [i].addr), 1); + num_regs++; + } + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, num_regs, "eng", -1, NULL, NULL); + return offset; +} + +/* Dump registers that are modified during GRC Dump and therefore must be dumped + * first. Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, num_reg_entries = 0, block_id; + u8 storm_id, reg_idx, num_attn_regs; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write parity registers */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + const struct dbg_attn_reg *attn_reg_arr; + + if (dev_data->block_in_reset[block_id] && dump) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + u16 modes_buf_offset; + bool eval_mode; + + /* Check mode */ + eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + if (!eval_mode || + qed_is_mode_match(p_hwfn, &modes_buf_offset)) { + /* Mode match - read and dump registers */ + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + reg_data->mask_address, + 1); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + GET_FIELD(reg_data->data, + DBG_ATTN_REG_STS_ADDRESS), + 1); + num_reg_entries += 2; + } + } + } + + /* Write storm stall status registers */ + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id] && + dump) + continue; + + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS(s_storm_defs[storm_id]. + sem_fast_mem_addr + + SEM_FAST_REG_STALLED), + 1); + num_reg_entries++; + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, + num_reg_entries, "eng", -1, NULL, NULL); + return offset; +} + +/* Dumps a GRC memory header (section and params). + * The following parameters are dumped: + * name - name is dumped only if it's not NULL. + * addr - byte_addr is dumped only if name is NULL. + * len - dword_len is always dumped. + * width - bit_width is dumped if it's not zero. + * packed - packed=1 is dumped if it's not false. + * mem_group - mem_group is always dumped. + * is_storm - true only if the memory is related to a Storm. + * storm_letter - storm letter (valid only if is_storm is true). + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + bool dump, + const char *name, + u32 byte_addr, + u32 dword_len, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u8 num_params = 3; + u32 offset = 0; + char buf[64]; + + if (!dword_len) + DP_NOTICE(p_hwfn, + "Unexpected GRC Dump error: dumped memory size must be non-zero\n"); + if (bit_width) + num_params++; + if (packed) + num_params++; + + /* Dump section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_mem", num_params); + if (name) { + /* Dump name */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), name); + } else { + strcpy(buf, name); + } + + offset += qed_dump_str_param(dump_buf + offset, + dump, "name", buf); + if (dump) + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Dumping %d registers from %s...\n", + dword_len, buf); + } else { + /* Dump address */ + offset += qed_dump_num_param(dump_buf + offset, + dump, "addr", byte_addr); + if (dump && dword_len > 64) + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Dumping %d registers from address 0x%x...\n", + dword_len, byte_addr); + } + + /* Dump len */ + offset += qed_dump_num_param(dump_buf + offset, dump, "len", dword_len); + + /* Dump bit width */ + if (bit_width) + offset += qed_dump_num_param(dump_buf + offset, + dump, "width", bit_width); + + /* Dump packed */ + if (packed) + offset += qed_dump_num_param(dump_buf + offset, + dump, "packed", 1); + + /* Dump reg type */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), mem_group); + } else { + strcpy(buf, mem_group); + } + + offset += qed_dump_str_param(dump_buf + offset, dump, "type", buf); + return offset; +} + +/* Dumps a single GRC memory. If name is NULL, the memory is stored by address. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 byte_addr, + u32 dword_len, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u32 offset = 0; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + byte_addr, + dword_len, + bit_width, + packed, + mem_group, is_storm, storm_letter); + if (dump) { + u32 i; + + for (i = 0; i < dword_len; + i++, byte_addr += BYTES_IN_DWORD, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, byte_addr); + } else { + offset += dword_len; + } + + return offset; +} + +/* Dumps GRC memories entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_mem_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_mems_arr, + u32 *dump_buf, bool dump) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + while (input_offset < input_mems_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr; + u32 num_entries; + bool eval_mode; + + cond_hdr = (const struct dbg_dump_cond_hdr *) + &input_mems_arr.ptr[input_offset++]; + eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + + /* Check required mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (!mode_match) { + input_offset += cond_hdr->data_size; + continue; + } + + num_entries = cond_hdr->data_size / MEM_DUMP_ENTRY_SIZE_DWORDS; + for (i = 0; i < num_entries; + i++, input_offset += MEM_DUMP_ENTRY_SIZE_DWORDS) { + const struct dbg_dump_mem *mem = + (const struct dbg_dump_mem *) + &input_mems_arr.ptr[input_offset]; + u8 mem_group_id; + + mem_group_id = GET_FIELD(mem->dword0, + DBG_DUMP_MEM_MEM_GROUP_ID); + if (mem_group_id >= MEM_GROUPS_NUM) { + DP_NOTICE(p_hwfn, "Invalid mem_group_id\n"); + return 0; + } + + if (qed_grc_is_mem_included(p_hwfn, + (enum block_id)cond_hdr->block_id, + mem_group_id)) { + u32 mem_byte_addr = + DWORDS_TO_BYTES(GET_FIELD(mem->dword0, + DBG_DUMP_MEM_ADDRESS)); + u32 mem_len = GET_FIELD(mem->dword1, + DBG_DUMP_MEM_LENGTH); + char storm_letter = 'a'; + bool is_storm = false; + + /* Update memory length for CCFC/TCFC memories + * according to number of LCIDs/LTIDs. + */ + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM) + mem_len = qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS) + * (mem_len / MAX_LCIDS); + else if (mem_group_id == MEM_GROUP_TASK_CFC_MEM) + mem_len = qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS) + * (mem_len / MAX_LTIDS); + + /* If memory is associated with Storm, update + * Storm details. + */ + if (s_block_defs[cond_hdr->block_id]-> + associated_to_storm) { + is_storm = true; + storm_letter = + s_storm_defs[s_block_defs[ + cond_hdr->block_id]-> + storm_id].letter; + } + + /* Dump memory */ + offset += qed_grc_dump_mem(p_hwfn, p_ptt, + dump_buf + offset, dump, NULL, + mem_byte_addr, mem_len, 0, + false, + s_mem_group_names[mem_group_id], + is_storm, storm_letter); + } + } + } + + return offset; +} + +/* Dumps GRC memories according to the input array dump_mem. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_memories(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 offset = 0, input_offset = 0; + + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr = + (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset++]; + u8 split_type_id = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + u32 split_data_size = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + struct dbg_array curr_input_mems_arr = { + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset], + split_data_size}; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + offset += qed_grc_dump_mem_entries(p_hwfn, + p_ptt, + curr_input_mems_arr, + dump_buf + offset, + dump); + break; + default: + DP_NOTICE(p_hwfn, + "Dumping split memories is currently not supported\n"); + break; + } + + input_offset += split_data_size; + } + + return offset; +} + +/* Dumps GRC context data for the specified Storm. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_ctx_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 num_lids, + u32 lid_size, + u32 rd_reg_addr, + u8 storm_id) +{ + u32 i, lid, total_size; + u32 offset = 0; + + if (!lid_size) + return 0; + lid_size *= BYTES_IN_DWORD; + total_size = num_lids * lid_size; + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + 0, + total_size, + lid_size * 32, + false, + name, + true, s_storm_defs[storm_id].letter); + + /* Dump context data */ + if (dump) { + for (lid = 0; lid < num_lids; lid++) { + for (i = 0; i < lid_size; i++, offset++) { + qed_wr(p_hwfn, + p_ptt, + s_storm_defs[storm_id].cm_ctx_wr_addr, + BIT(9) | lid); + *(dump_buf + offset) = qed_rd(p_hwfn, + p_ptt, + rd_reg_addr); + } + } + } else { + offset += total_size; + } + + return offset; +} + +/* Dumps GRC contexts. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_ctx(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + /* Dump Conn AG context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_AG_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS), + s_storm_defs[storm_id]. + cm_conn_ag_ctx_lid_size, + s_storm_defs[storm_id]. + cm_conn_ag_ctx_rd_addr, + storm_id); + + /* Dump Conn ST context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_ST_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS), + s_storm_defs[storm_id]. + cm_conn_st_ctx_lid_size, + s_storm_defs[storm_id]. + cm_conn_st_ctx_rd_addr, + storm_id); + + /* Dump Task AG context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_AG_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS), + s_storm_defs[storm_id]. + cm_task_ag_ctx_lid_size, + s_storm_defs[storm_id]. + cm_task_ag_ctx_rd_addr, + storm_id); + + /* Dump Task ST context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_ST_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS), + s_storm_defs[storm_id]. + cm_task_st_ctx_lid_size, + s_storm_defs[storm_id]. + cm_task_st_ctx_rd_addr, + storm_id); + } + + return offset; +} + +/* Dumps GRC IORs data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_iors(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + char buf[10] = "IOR_SET_?"; + u8 storm_id, set_id; + u32 offset = 0; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) { + for (set_id = 0; set_id < NUM_IOR_SETS; set_id++) { + u32 addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STORM_REG_FILE + + DWORDS_TO_BYTES(IOR_SET_OFFSET(set_id)); + + buf[strlen(buf) - 1] = '0' + set_id; + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + buf, + addr, + IORS_PER_SET, + 32, + false, + "ior", + true, + s_storm_defs + [storm_id].letter); + } + } + } + + return offset; +} + +/* Dump VFC CAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc_cam(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 storm_id) +{ + u32 total_size = VFC_CAM_NUM_ROWS * VFC_CAM_RESP_DWORDS; + u32 cam_addr[VFC_CAM_ADDR_DWORDS] = { 0 }; + u32 cam_cmd[VFC_CAM_CMD_DWORDS] = { 0 }; + u32 offset = 0; + u32 row, i; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + "vfc_cam", + 0, + total_size, + 256, + false, + "vfc_cam", + true, s_storm_defs[storm_id].letter); + if (dump) { + /* Prepare CAM address */ + SET_VAR_FIELD(cam_addr, VFC_CAM_ADDR, OP, VFC_OPCODE_CAM_RD); + for (row = 0; row < VFC_CAM_NUM_ROWS; + row++, offset += VFC_CAM_RESP_DWORDS) { + /* Write VFC CAM command */ + SET_VAR_FIELD(cam_cmd, VFC_CAM_CMD, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_WR, + cam_cmd, VFC_CAM_CMD_DWORDS); + + /* Write VFC CAM address */ + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_ADDR, + cam_addr, VFC_CAM_ADDR_DWORDS); + + /* Read VFC CAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_CAM_RESP_DWORDS); + } + } else { + offset += total_size; + } + + return offset; +} + +/* Dump VFC RAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 storm_id, struct vfc_ram_defs *ram_defs) +{ + u32 total_size = ram_defs->num_rows * VFC_RAM_RESP_DWORDS; + u32 ram_addr[VFC_RAM_ADDR_DWORDS] = { 0 }; + u32 ram_cmd[VFC_RAM_CMD_DWORDS] = { 0 }; + u32 offset = 0; + u32 row, i; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + ram_defs->mem_name, + 0, + total_size, + 256, + false, + ram_defs->type_name, + true, s_storm_defs[storm_id].letter); + + /* Prepare RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, OP, VFC_OPCODE_RAM_RD); + + if (!dump) + return offset + total_size; + + for (row = ram_defs->base_row; + row < ram_defs->base_row + ram_defs->num_rows; + row++, offset += VFC_RAM_RESP_DWORDS) { + /* Write VFC RAM command */ + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_WR, + ram_cmd, VFC_RAM_CMD_DWORDS); + + /* Write VFC RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_ADDR, + ram_addr, VFC_RAM_ADDR_DWORDS); + + /* Read VFC RAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_RAM_RESP_DWORDS); + } + + return offset; +} + +/* Dumps GRC VFC data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 storm_id, i; + u32 offset = 0; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id) && + s_storm_defs[storm_id].has_vfc && + (storm_id != DBG_PSTORM_ID || + dev_data->platform_id == PLATFORM_ASIC)) { + /* Read CAM */ + offset += qed_grc_dump_vfc_cam(p_hwfn, + p_ptt, + dump_buf + offset, + dump, storm_id); + + /* Read RAM */ + for (i = 0; i < NUM_VFC_RAM_TYPES; i++) + offset += qed_grc_dump_vfc_ram(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + storm_id, + &s_vfc_ram_defs + [i]); + } + } + + return offset; +} + +/* Dumps GRC RSS data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_rss(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + u8 rss_mem_id; + + for (rss_mem_id = 0; rss_mem_id < NUM_RSS_MEM_TYPES; rss_mem_id++) { + struct rss_mem_defs *rss_defs = &s_rss_mem_defs[rss_mem_id]; + u32 num_entries = rss_defs->num_entries[dev_data->chip_id]; + u32 entry_width = rss_defs->entry_width[dev_data->chip_id]; + u32 total_size = (num_entries * entry_width) / 32; + bool packed = (entry_width == 16); + u32 addr = rss_defs->addr; + u32 i, j; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + rss_defs->mem_name, + addr, + total_size, + entry_width, + packed, + rss_defs->type_name, false, 0); + + if (!dump) { + offset += total_size; + continue; + } + + /* Dump RSS data */ + for (i = 0; i < BYTES_TO_DWORDS(total_size); i++, addr++) { + qed_wr(p_hwfn, p_ptt, RSS_REG_RSS_RAM_ADDR, addr); + for (j = 0; j < BYTES_IN_DWORD; j++, offset++) + *(dump_buf + offset) = + qed_rd(p_hwfn, p_ptt, + RSS_REG_RSS_RAM_DATA + + DWORDS_TO_BYTES(j)); + } + } + + return offset; +} + +/* Dumps GRC Big RAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 big_ram_id) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char mem_name[12] = "???_BIG_RAM"; + char type_name[8] = "???_RAM"; + u32 ram_size, total_blocks; + u32 offset = 0, i, j; + + total_blocks = + s_big_ram_defs[big_ram_id].num_of_blocks[dev_data->chip_id]; + ram_size = total_blocks * BIG_RAM_BLOCK_SIZE_DWORDS; + + strncpy(type_name, s_big_ram_defs[big_ram_id].instance_name, + strlen(s_big_ram_defs[big_ram_id].instance_name)); + strncpy(mem_name, s_big_ram_defs[big_ram_id].instance_name, + strlen(s_big_ram_defs[big_ram_id].instance_name)); + + /* Dump memory header */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + ram_size, + BIG_RAM_BLOCK_SIZE_BYTES * 8, + false, type_name, false, 0); + + if (!dump) + return offset + ram_size; + + /* Read and dump Big RAM data */ + for (i = 0; i < total_blocks / 2; i++) { + qed_wr(p_hwfn, p_ptt, s_big_ram_defs[big_ram_id].addr_reg_addr, + i); + for (j = 0; j < 2 * BIG_RAM_BLOCK_SIZE_DWORDS; j++, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, + s_big_ram_defs[big_ram_id]. + data_reg_addr + + DWORDS_TO_BYTES(j)); + } + + return offset; +} + +static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + bool block_enable[MAX_BLOCK_ID] = { 0 }; + bool halted = false; + u32 offset = 0; + + /* Halt MCP */ + if (dump) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Dump MCP scratchpad */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + MCP_REG_SCRATCH, + MCP_REG_SCRATCH_SIZE, + 0, false, "MCP", false, 0); + + /* Dump MCP cpu_reg_file */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + MCP_REG_CPU_REG_FILE, + MCP_REG_CPU_REG_FILE_SIZE, + 0, false, "MCP", false, 0); + + /* Dump MCP registers */ + block_enable[BLOCK_MCP] = true; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + offset, + dump, block_enable, "block", "MCP"); + + /* Dump required non-MCP registers */ + offset += qed_grc_dump_regs_hdr(dump_buf + offset, + dump, 1, "eng", -1, "block", "MCP"); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS + (MISC_REG_SHARED_MEM_ADDR), 1); + + /* Release MCP */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt)) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + return offset; +} + +/* Dumps the tbus indirect memory for all PHYs. */ +static u32 qed_grc_dump_phy(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0, tbus_lo_offset, tbus_hi_offset; + char mem_name[32]; + u8 phy_id; + + for (phy_id = 0; phy_id < ARRAY_SIZE(s_phy_defs); phy_id++) { + struct phy_defs *phy_defs = &s_phy_defs[phy_id]; + int printed_chars; + + printed_chars = snprintf(mem_name, sizeof(mem_name), "tbus_%s", + phy_defs->phy_name); + if (printed_chars < 0 || printed_chars >= sizeof(mem_name)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid PHY memory name\n"); + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + PHY_DUMP_SIZE_DWORDS, + 16, true, mem_name, false, 0); + if (dump) { + u32 addr_lo_addr = phy_defs->base_addr + + phy_defs->tbus_addr_lo_addr; + u32 addr_hi_addr = phy_defs->base_addr + + phy_defs->tbus_addr_hi_addr; + u32 data_lo_addr = phy_defs->base_addr + + phy_defs->tbus_data_lo_addr; + u32 data_hi_addr = phy_defs->base_addr + + phy_defs->tbus_data_hi_addr; + u8 *bytes_buf = (u8 *)(dump_buf + offset); + + for (tbus_hi_offset = 0; + tbus_hi_offset < (NUM_PHY_TBUS_ADDRESSES >> 8); + tbus_hi_offset++) { + qed_wr(p_hwfn, + p_ptt, addr_hi_addr, tbus_hi_offset); + for (tbus_lo_offset = 0; tbus_lo_offset < 256; + tbus_lo_offset++) { + qed_wr(p_hwfn, + p_ptt, + addr_lo_addr, tbus_lo_offset); + *(bytes_buf++) = + (u8)qed_rd(p_hwfn, p_ptt, + data_lo_addr); + *(bytes_buf++) = + (u8)qed_rd(p_hwfn, p_ptt, + data_hi_addr); + } + } + } + + offset += PHY_DUMP_SIZE_DWORDS; + } + + return offset; +} + +static void qed_config_dbg_line(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum block_id block_id, + u8 line_id, + u8 cycle_en, + u8 right_shift, u8 force_valid, u8 force_frame) +{ + struct block_defs *p_block_defs = s_block_defs[block_id]; + + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_select_addr, line_id); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_cycle_enable_addr, cycle_en); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_shift_addr, right_shift); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_force_valid_addr, force_valid); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_force_frame_addr, force_frame); +} + +/* Dumps Static Debug data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 block_dwords = NUM_DBG_BUS_LINES * STATIC_DEBUG_LINE_DWORDS; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, block_id, line_id, addr, i; + struct block_defs *p_block_defs; + + if (dump) { + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, "Dumping static debug data...\n"); + + /* Disable all blocks debug output */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + p_block_defs = s_block_defs[block_id]; + + if (p_block_defs->has_dbg_bus[dev_data->chip_id]) + qed_wr(p_hwfn, p_ptt, + p_block_defs->dbg_cycle_enable_addr, 0); + } + + qed_bus_reset_dbg_block(p_hwfn, p_ptt); + qed_bus_set_framing_mode(p_hwfn, + p_ptt, DBG_BUS_FRAME_MODE_8HW_0ST); + qed_wr(p_hwfn, + p_ptt, DBG_REG_DEBUG_TARGET, DBG_BUS_TARGET_ID_INT_BUF); + qed_wr(p_hwfn, p_ptt, DBG_REG_FULL_MODE, 1); + qed_bus_enable_dbg_block(p_hwfn, p_ptt, true); + } + + /* Dump all static debug lines for each relevant block */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + p_block_defs = s_block_defs[block_id]; + + if (!p_block_defs->has_dbg_bus[dev_data->chip_id]) + continue; + + /* Dump static section params */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + p_block_defs->name, 0, + block_dwords, 32, false, + "STATIC", false, 0); + + if (dump && !dev_data->block_in_reset[block_id]) { + u8 dbg_client_id = + p_block_defs->dbg_client_id[dev_data->chip_id]; + + /* Enable block's client */ + qed_bus_enable_clients(p_hwfn, p_ptt, + BIT(dbg_client_id)); + + for (line_id = 0; line_id < NUM_DBG_BUS_LINES; + line_id++) { + /* Configure debug line ID */ + qed_config_dbg_line(p_hwfn, + p_ptt, + (enum block_id)block_id, + (u8)line_id, + 0xf, 0, 0, 0); + + /* Read debug line info */ + for (i = 0, addr = DBG_REG_CALENDAR_OUT_DATA; + i < STATIC_DEBUG_LINE_DWORDS; + i++, offset++, addr += BYTES_IN_DWORD) + dump_buf[offset] = qed_rd(p_hwfn, p_ptt, + addr); + } + + /* Disable block's client and debug output */ + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + qed_wr(p_hwfn, p_ptt, + p_block_defs->dbg_cycle_enable_addr, 0); + } else { + /* All lines are invalid - dump zeros */ + if (dump) + memset(dump_buf + offset, 0, + DWORDS_TO_BYTES(block_dwords)); + offset += block_dwords; + } + } + + if (dump) { + qed_bus_enable_dbg_block(p_hwfn, p_ptt, false); + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + } + + return offset; +} + +/* Performs GRC Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + bool parities_masked = false; + u8 i, port_mode = 0; + u32 offset = 0; + + /* Check if emulation platform */ + *num_dumped_dwords = 0; + + /* Fill GRC parameters that were not set by the user with their default + * value. + */ + qed_dbg_grc_set_params_default(p_hwfn); + + /* Find port mode */ + if (dump) { + switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) { + case 0: + port_mode = 1; + break; + case 1: + port_mode = 2; + break; + case 2: + port_mode = 4; + break; + } + } + + /* Update reset state */ + if (dump) + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 4); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "grc-dump"); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-lcids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-ltids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, "num-ports", port_mode); + + /* Dump reset registers (dumped before taking blocks out of reset ) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_reset_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Take all blocks out of reset (using reset registers) */ + if (dump) { + qed_grc_unreset_blocks(p_hwfn, p_ptt); + qed_update_blocks_reset_state(p_hwfn, p_ptt); + } + + /* Disable all parities using MFW command */ + if (dump) { + parities_masked = !qed_mcp_mask_parities(p_hwfn, p_ptt, 1); + if (!parities_masked) { + if (qed_grc_get_param + (p_hwfn, DBG_GRC_PARAM_PARITY_SAFE)) + return DBG_STATUS_MCP_COULD_NOT_MASK_PRTY; + else + DP_NOTICE(p_hwfn, + "Failed to mask parities using MFW\n"); + } + } + + /* Dump modified registers (dumped before modifying them) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_modified_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Stall storms */ + if (dump && + (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_IOR) || + qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC))) + qed_grc_stall_storms(p_hwfn, p_ptt, true); + + /* Dump all regs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) { + /* Dump all blocks except MCP */ + bool block_enable[MAX_BLOCK_ID]; + + for (i = 0; i < MAX_BLOCK_ID; i++) + block_enable[i] = true; + block_enable[BLOCK_MCP] = false; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + block_enable, NULL, NULL); + } + + /* Dump memories */ + offset += qed_grc_dump_memories(p_hwfn, p_ptt, dump_buf + offset, dump); + + /* Dump MCP */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MCP)) + offset += qed_grc_dump_mcp(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump context */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM_CTX)) + offset += qed_grc_dump_ctx(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump RSS memories */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RSS)) + offset += qed_grc_dump_rss(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump Big RAM */ + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) + if (qed_grc_is_included(p_hwfn, s_big_ram_defs[i].grc_param)) + offset += qed_grc_dump_big_ram(p_hwfn, + p_ptt, + dump_buf + offset, + dump, i); + + /* Dump IORs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR)) + offset += qed_grc_dump_iors(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump VFC */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC)) + offset += qed_grc_dump_vfc(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump PHY tbus */ + if (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_PHY) && dev_data->chip_id == + CHIP_K2 && dev_data->platform_id == PLATFORM_ASIC) + offset += qed_grc_dump_phy(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump static debug data */ + if (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_STATIC) && + dev_data->bus.state == DBG_BUS_STATE_IDLE) + offset += qed_grc_dump_static_debug(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + if (dump) { + /* Unstall storms */ + if (qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_UNSTALL)) + qed_grc_stall_storms(p_hwfn, p_ptt, false); + + /* Clear parity status */ + qed_grc_clear_all_prty(p_hwfn, p_ptt); + + /* Enable all parities using MFW command */ + if (parities_masked) + qed_mcp_mask_parities(p_hwfn, p_ptt, 0); + } + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Writes the specified failing Idle Check rule to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 * + dump_buf, + bool dump, + u16 rule_id, + const struct dbg_idle_chk_rule *rule, + u16 fail_entry_id, u32 *cond_reg_values) +{ + const union dbg_idle_chk_reg *regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays + [BIN_BUF_DBG_IDLE_CHK_REGS]. + ptr)[rule->reg_offset]; + const struct dbg_idle_chk_cond_reg *cond_regs = ®s[0].cond_reg; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct dbg_idle_chk_result_hdr *hdr = + (struct dbg_idle_chk_result_hdr *)dump_buf; + const struct dbg_idle_chk_info_reg *info_regs = + ®s[rule->num_cond_regs].info_reg; + u32 next_reg_offset = 0, i, offset = 0; + u8 reg_id; + + /* Dump rule data */ + if (dump) { + memset(hdr, 0, sizeof(*hdr)); + hdr->rule_id = rule_id; + hdr->mem_entry_id = fail_entry_id; + hdr->severity = rule->severity; + hdr->num_dumped_cond_regs = rule->num_cond_regs; + } + + offset += IDLE_CHK_RESULT_HDR_DWORDS; + + /* Dump condition register values */ + for (reg_id = 0; reg_id < rule->num_cond_regs; reg_id++) { + const struct dbg_idle_chk_cond_reg *reg = &cond_regs[reg_id]; + + /* Write register header */ + if (dump) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr = + (struct dbg_idle_chk_result_reg_hdr *)(dump_buf + + offset); + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + memset(reg_hdr, 0, + sizeof(struct dbg_idle_chk_result_reg_hdr)); + reg_hdr->start_entry = reg->start_entry; + reg_hdr->size = reg->entry_size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM, + reg->num_entries > 1 || reg->start_entry > 0 + ? 1 : 0); + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, reg_id); + + /* Write register values */ + for (i = 0; i < reg_hdr->size; + i++, next_reg_offset++, offset++) + dump_buf[offset] = + cond_reg_values[next_reg_offset]; + } else { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + + reg->entry_size; + } + } + + /* Dump info register values */ + for (reg_id = 0; reg_id < rule->num_info_regs; reg_id++) { + const struct dbg_idle_chk_info_reg *reg = &info_regs[reg_id]; + u32 block_id; + + if (!dump) { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + reg->size; + continue; + } + + /* Check if register's block is in reset */ + block_id = GET_FIELD(reg->data, DBG_IDLE_CHK_INFO_REG_BLOCK_ID); + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + if (!dev_data->block_in_reset[block_id]) { + bool eval_mode = GET_FIELD(reg->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + bool mode_match = true; + + /* Check mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(reg->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = + qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match) { + u32 grc_addr = + DWORDS_TO_BYTES(GET_FIELD(reg->data, + DBG_IDLE_CHK_INFO_REG_ADDRESS)); + + /* Write register header */ + struct dbg_idle_chk_result_reg_hdr *reg_hdr = + (struct dbg_idle_chk_result_reg_hdr *) + (dump_buf + offset); + + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + hdr->num_dumped_info_regs++; + memset(reg_hdr, 0, sizeof(*reg_hdr)); + reg_hdr->size = reg->size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, + rule->num_cond_regs + reg_id); + + /* Write register values */ + for (i = 0; i < reg->size; + i++, offset++, grc_addr += 4) + dump_buf[offset] = + qed_rd(p_hwfn, p_ptt, grc_addr); + } + } + } + + return offset; +} + +/* Dumps idle check rule entries. Returns the dumped size in dwords. */ +static u32 +qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, + const struct dbg_idle_chk_rule *input_rules, + u32 num_input_rules, u32 *num_failing_rules) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 cond_reg_values[IDLE_CHK_MAX_ENTRIES_SIZE]; + u32 i, j, offset = 0; + u16 entry_id; + u8 reg_id; + + *num_failing_rules = 0; + for (i = 0; i < num_input_rules; i++) { + const struct dbg_idle_chk_cond_reg *cond_regs; + const struct dbg_idle_chk_rule *rule; + const union dbg_idle_chk_reg *regs; + u16 num_reg_entries = 1; + bool check_rule = true; + const u32 *imm_values; + + rule = &input_rules[i]; + regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr) + [rule->reg_offset]; + cond_regs = ®s[0].cond_reg; + imm_values = &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr + [rule->imm_offset]; + + /* Check if all condition register blocks are out of reset, and + * find maximal number of entries (all condition registers that + * are memories must have the same size, which is > 1). + */ + for (reg_id = 0; reg_id < rule->num_cond_regs && check_rule; + reg_id++) { + u32 block_id = GET_FIELD(cond_regs[reg_id].data, + DBG_IDLE_CHK_COND_REG_BLOCK_ID); + + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + check_rule = !dev_data->block_in_reset[block_id]; + if (cond_regs[reg_id].num_entries > num_reg_entries) + num_reg_entries = cond_regs[reg_id].num_entries; + } + + if (!check_rule && dump) + continue; + + /* Go over all register entries (number of entries is the same + * for all condition registers). + */ + for (entry_id = 0; entry_id < num_reg_entries; entry_id++) { + /* Read current entry of all condition registers */ + if (dump) { + u32 next_reg_offset = 0; + + for (reg_id = 0; + reg_id < rule->num_cond_regs; + reg_id++) { + const struct dbg_idle_chk_cond_reg + *reg = &cond_regs[reg_id]; + + /* Find GRC address (if it's a memory, + * the address of the specific entry is + * calculated). + */ + u32 grc_addr = + DWORDS_TO_BYTES( + GET_FIELD(reg->data, + DBG_IDLE_CHK_COND_REG_ADDRESS)); + + if (reg->num_entries > 1 || + reg->start_entry > 0) { + u32 padded_entry_size = + reg->entry_size > 1 ? + roundup_pow_of_two + (reg->entry_size) : 1; + + grc_addr += + DWORDS_TO_BYTES( + (reg->start_entry + + entry_id) + * padded_entry_size); + } + + /* Read registers */ + if (next_reg_offset + reg->entry_size >= + IDLE_CHK_MAX_ENTRIES_SIZE) { + DP_NOTICE(p_hwfn, + "idle check registers entry is too large\n"); + return 0; + } + + for (j = 0; j < reg->entry_size; + j++, next_reg_offset++, + grc_addr += 4) + cond_reg_values[next_reg_offset] = + qed_rd(p_hwfn, p_ptt, grc_addr); + } + } + + /* Call rule's condition function - a return value of + * true indicates failure. + */ + if ((*cond_arr[rule->cond_id])(cond_reg_values, + imm_values) || !dump) { + offset += + qed_idle_chk_dump_failure(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + rule->rule_id, + rule, + entry_id, + cond_reg_values); + (*num_failing_rules)++; + break; + } + } + } + + return offset; +} + +/* Performs Idle Check Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0, input_offset = 0, num_failing_rules = 0; + u32 num_failing_rules_offset; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "idle-chk"); + + /* Dump idle check section header with a single parameter */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "idle_chk", 1); + num_failing_rules_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "num_rules", 0); + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].size_in_dwords) { + const struct dbg_idle_chk_cond_hdr *cond_hdr = + (const struct dbg_idle_chk_cond_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr + [input_offset++]; + bool eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + bool mode_match = true; + + /* Check mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match) { + u32 curr_failing_rules; + + offset += + qed_idle_chk_dump_rule_entries(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + (const struct dbg_idle_chk_rule *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES]. + ptr[input_offset], + cond_hdr->data_size / IDLE_CHK_RULE_SIZE_DWORDS, + &curr_failing_rules); + num_failing_rules += curr_failing_rules; + } + + input_offset += cond_hdr->data_size; + } + + /* Overwrite num_rules parameter */ + if (dump) + qed_dump_num_param(dump_buf + num_failing_rules_offset, + dump, "num_rules", num_failing_rules); + + return offset; +} + +/* Finds the meta data image in NVRAM. */ +static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 image_type, + u32 *nvram_offset_bytes, + u32 *nvram_size_bytes) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_txn_size; + struct mcp_file_att file_att; + + /* Call NVRAM get file command */ + if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_NVM_GET_FILE_ATT, + image_type, &ret_mcp_resp, &ret_mcp_param, + &ret_txn_size, (u32 *)&file_att) != 0) + return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + + /* Check response */ + if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + + /* Update return values */ + *nvram_offset_bytes = file_att.nvm_start_addr; + *nvram_size_bytes = file_att.len; + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "find_nvram_image: found NVRAM image of type %d in NVRAM offset %d bytes with size %d bytes\n", + image_type, *nvram_offset_bytes, *nvram_size_bytes); + + /* Check alignment */ + if (*nvram_size_bytes & 0x3) + return DBG_STATUS_NON_ALIGNED_NVRAM_IMAGE; + return DBG_STATUS_OK; +} + +static enum dbg_status qed_nvram_read(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_bytes, + u32 nvram_size_bytes, u32 *ret_buf) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_read_size; + u32 bytes_to_copy, read_offset = 0; + s32 bytes_left = nvram_size_bytes; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "nvram_read: reading image of size %d bytes from NVRAM\n", + nvram_size_bytes); + do { + bytes_to_copy = + (bytes_left > + MCP_DRV_NVM_BUF_LEN) ? MCP_DRV_NVM_BUF_LEN : bytes_left; + + /* Call NVRAM read command */ + if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_NVM_READ_NVRAM, + (nvram_offset_bytes + + read_offset) | + (bytes_to_copy << + DRV_MB_PARAM_NVM_LEN_SHIFT), + &ret_mcp_resp, &ret_mcp_param, + &ret_read_size, + (u32 *)((u8 *)ret_buf + + read_offset)) != 0) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Check response */ + if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Update read offset */ + read_offset += ret_read_size; + bytes_left -= ret_read_size; + } while (bytes_left > 0); + + return DBG_STATUS_OK; +} + +/* Get info on the MCP Trace data in the scratchpad: + * - trace_data_grc_addr - the GRC address of the trace data + * - trace_data_size_bytes - the size in bytes of the MCP Trace data (without + * the header) + */ +static enum dbg_status qed_mcp_trace_get_data_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *trace_data_grc_addr, + u32 *trace_data_size_bytes) +{ + /* Read MCP trace section offsize structure from MCP scratchpad */ + u32 spad_trace_offsize = qed_rd(p_hwfn, + p_ptt, + MCP_SPAD_TRACE_OFFSIZE_ADDR); + u32 signature; + + /* Extract MCP trace section GRC address from offsize structure (within + * scratchpad). + */ + *trace_data_grc_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize); + + /* Read signature from MCP trace section */ + signature = qed_rd(p_hwfn, p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, signature)); + if (signature != MFW_TRACE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read trace size from MCP trace section */ + *trace_data_size_bytes = qed_rd(p_hwfn, + p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, size)); + return DBG_STATUS_OK; +} + +/* Reads MCP trace meta data image from NVRAM. + * - running_bundle_id (OUT) - the running bundle ID (invalid when loaded from + * file) + * - trace_meta_offset_bytes (OUT) - the NVRAM offset in bytes in which the MCP + * Trace meta data starts (invalid when loaded from file) + * - trace_meta_size_bytes (OUT) - the size in bytes of the MCP Trace meta data + */ +static enum dbg_status qed_mcp_trace_get_meta_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 trace_data_size_bytes, + u32 *running_bundle_id, + u32 *trace_meta_offset_bytes, + u32 *trace_meta_size_bytes) +{ + /* Read MCP trace section offsize structure from MCP scratchpad */ + u32 spad_trace_offsize = qed_rd(p_hwfn, + p_ptt, + MCP_SPAD_TRACE_OFFSIZE_ADDR); + + /* Find running bundle ID */ + u32 running_mfw_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize) + + QED_SECTION_SIZE(spad_trace_offsize) + trace_data_size_bytes; + enum dbg_status status; + u32 nvram_image_type; + + *running_bundle_id = qed_rd(p_hwfn, p_ptt, running_mfw_addr); + if (*running_bundle_id > 1) + return DBG_STATUS_INVALID_NVRAM_BUNDLE; + + /* Find image in NVRAM */ + nvram_image_type = + (*running_bundle_id == + DIR_ID_1) ? NVM_TYPE_MFW_TRACE1 : NVM_TYPE_MFW_TRACE2; + status = qed_find_nvram_image(p_hwfn, + p_ptt, + nvram_image_type, + trace_meta_offset_bytes, + trace_meta_size_bytes); + + return status; +} + +/* Reads the MCP Trace data from the specified GRC address into the specified + * buffer. + */ +static void qed_mcp_trace_read_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 grc_addr, u32 size_in_dwords, u32 *buf) +{ + u32 i; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "mcp_trace_read_data: reading trace data of size %d dwords from GRC address 0x%x\n", + size_in_dwords, grc_addr); + for (i = 0; i < size_in_dwords; i++, grc_addr += BYTES_IN_DWORD) + buf[i] = qed_rd(p_hwfn, p_ptt, grc_addr); +} + +/* Reads the MCP Trace meta data (from NVRAM or buffer) into the specified + * buffer. + */ +static enum dbg_status qed_mcp_trace_read_meta(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_in_bytes, + u32 size_in_bytes, u32 *buf) +{ + u8 *byte_buf = (u8 *)buf; + u8 modules_num, i; + u32 signature; + + /* Read meta data from NVRAM */ + enum dbg_status status = qed_nvram_read(p_hwfn, + p_ptt, + nvram_offset_in_bytes, + size_in_bytes, + buf); + + if (status != DBG_STATUS_OK) + return status; + + /* Extract and check first signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(u32); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Extract number of modules */ + modules_num = *(byte_buf++); + + /* Skip all modules */ + for (i = 0; i < modules_num; i++) { + u8 module_len = *(byte_buf++); + + byte_buf += module_len; + } + + /* Extract and check second signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(u32); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + return DBG_STATUS_OK; +} + +/* Dump MCP Trace */ +enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 trace_data_grc_addr, trace_data_size_bytes, trace_data_size_dwords; + u32 trace_meta_size_dwords, running_bundle_id, offset = 0; + u32 trace_meta_offset_bytes, trace_meta_size_bytes; + enum dbg_status status; + int halted = 0; + + *num_dumped_dwords = 0; + + /* Get trace data info */ + status = qed_mcp_trace_get_data_info(p_hwfn, + p_ptt, + &trace_data_grc_addr, + &trace_data_size_bytes); + if (status != DBG_STATUS_OK) + return status; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "mcp-trace"); + + /* Halt MCP while reading from scratchpad so the read data will be + * consistent if halt fails, MCP trace is taken anyway, with a small + * risk that it may be corrupt. + */ + if (dump) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Find trace data size */ + trace_data_size_dwords = + DIV_ROUND_UP(trace_data_size_bytes + sizeof(struct mcp_trace), + BYTES_IN_DWORD); + + /* Dump trace data section header and param */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_data", 1); + offset += qed_dump_num_param(dump_buf + offset, + dump, "size", trace_data_size_dwords); + + /* Read trace data from scratchpad into dump buffer */ + if (dump) + qed_mcp_trace_read_data(p_hwfn, + p_ptt, + trace_data_grc_addr, + trace_data_size_dwords, + dump_buf + offset); + offset += trace_data_size_dwords; + + /* Resume MCP (only if halt succeeded) */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt) != 0) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + + /* Dump trace meta section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_meta", 1); + + /* Read trace meta info */ + status = qed_mcp_trace_get_meta_info(p_hwfn, + p_ptt, + trace_data_size_bytes, + &running_bundle_id, + &trace_meta_offset_bytes, + &trace_meta_size_bytes); + if (status != DBG_STATUS_OK) + return status; + + /* Dump trace meta size param (trace_meta_size_bytes is always + * dword-aligned). + */ + trace_meta_size_dwords = BYTES_TO_DWORDS(trace_meta_size_bytes); + offset += qed_dump_num_param(dump_buf + offset, dump, "size", + trace_meta_size_dwords); + + /* Read trace meta image into dump buffer */ + if (dump) { + status = qed_mcp_trace_read_meta(p_hwfn, + p_ptt, + trace_meta_offset_bytes, + trace_meta_size_bytes, + dump_buf + offset); + if (status != DBG_STATUS_OK) + return status; + } + + offset += trace_meta_size_dwords; + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Dump GRC FIFO */ +enum dbg_status qed_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, dwords_read, size_param_offset; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "reg-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for now, + * and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "reg_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is REG_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += REG_FIFO_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. + */ + for (dwords_read = 0; + fifo_has_data && dwords_read < REG_FIFO_DEPTH_DWORDS; + dwords_read += REG_FIFO_ELEMENT_DWORDS, offset += + REG_FIFO_ELEMENT_DWORDS) { + if (qed_dmae_grc2host(p_hwfn, p_ptt, GRC_REG_TRACE_FIFO, + (u64)(uintptr_t)(&dump_buf[offset]), + REG_FIFO_ELEMENT_DWORDS, 0)) + return DBG_STATUS_DMAE_FAILED; + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Dump IGU FIFO */ +enum dbg_status qed_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, dwords_read, size_param_offset; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "igu-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for now, + * and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "igu_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is IGU_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += IGU_FIFO_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. + */ + for (dwords_read = 0; + fifo_has_data && dwords_read < IGU_FIFO_DEPTH_DWORDS; + dwords_read += IGU_FIFO_ELEMENT_DWORDS, offset += + IGU_FIFO_ELEMENT_DWORDS) { + if (qed_dmae_grc2host(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_MEMORY, + (u64)(uintptr_t)(&dump_buf[offset]), + IGU_FIFO_ELEMENT_DWORDS, 0)) + return DBG_STATUS_DMAE_FAILED; + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Protection Override dump */ +enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, size_param_offset, override_window_dwords; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "protection-override"); + + /* Dump data section header and param. The size param is 0 for now, and + * is overwritten after reading the data. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "protection_override_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + offset += PROTECTION_OVERRIDE_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + /* Add override window info to buffer */ + override_window_dwords = + qed_rd(p_hwfn, p_ptt, + GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS; + if (qed_dmae_grc2host(p_hwfn, p_ptt, + GRC_REG_PROTECTION_OVERRIDE_WINDOW, + (u64)(uintptr_t)(dump_buf + offset), + override_window_dwords, 0)) + return DBG_STATUS_DMAE_FAILED; + offset += override_window_dwords; + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + override_window_dwords); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Performs FW Asserts Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char storm_letter_str[2] = "?"; + struct fw_info fw_info; + u32 offset = 0, i; + u8 storm_id; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "fw-asserts"); + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + u32 fw_asserts_section_addr, next_list_idx_addr, next_list_idx, + last_list_idx, element_addr; + + if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id]) + continue; + + /* Read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info); + + /* Dump FW Asserts section header and params */ + storm_letter_str[0] = s_storm_defs[storm_id].letter; + offset += qed_dump_section_hdr(dump_buf + offset, dump, + "fw_asserts", 2); + offset += qed_dump_str_param(dump_buf + offset, dump, "storm", + storm_letter_str); + offset += qed_dump_num_param(dump_buf + offset, dump, "size", + fw_info.fw_asserts_section. + list_element_dword_size); + + if (!dump) { + offset += fw_info.fw_asserts_section. + list_element_dword_size; + continue; + } + + /* Read and dump FW Asserts data */ + fw_asserts_section_addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_INT_RAM + + RAM_LINES_TO_BYTES(fw_info.fw_asserts_section. + section_ram_line_offset); + next_list_idx_addr = + fw_asserts_section_addr + + DWORDS_TO_BYTES(fw_info.fw_asserts_section. + list_next_index_dword_offset); + next_list_idx = qed_rd(p_hwfn, p_ptt, next_list_idx_addr); + last_list_idx = (next_list_idx > 0 + ? next_list_idx + : fw_info.fw_asserts_section.list_num_elements) + - 1; + element_addr = + fw_asserts_section_addr + + DWORDS_TO_BYTES(fw_info.fw_asserts_section. + list_dword_offset) + + last_list_idx * + DWORDS_TO_BYTES(fw_info.fw_asserts_section. + list_element_dword_size); + for (i = 0; + i < fw_info.fw_asserts_section.list_element_dword_size; + i++, offset++, element_addr += BYTES_IN_DWORD) + dump_buf[offset] = qed_rd(p_hwfn, p_ptt, element_addr); + } + + /* Dump last section */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "last", 0); + return offset; +} + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr) +{ + /* Convert binary data to debug arrays */ + u32 num_of_buffers = *(u32 *)bin_ptr; + struct bin_buffer_hdr *buf_array; + u8 buf_id; + + buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); + + for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + s_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + return qed_grc_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_grc_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* GRC Dump */ + status = qed_grc_dump(p_hwfn, p_ptt, dump_buf, true, num_dumped_dwords); + + /* Clear all GRC params */ + qed_dbg_grc_clear_params(p_hwfn); + return status; +} + +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + if (!dev_data->idle_chk.buf_size_set) { + dev_data->idle_chk.buf_size = qed_idle_chk_dump(p_hwfn, + p_ptt, + NULL, false); + dev_data->idle_chk.buf_size_set = true; + } + + *buf_size = dev_data->idle_chk.buf_size; + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_idle_chk_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Idle Check Dump */ + *num_dumped_dwords = qed_idle_chk_dump(p_hwfn, p_ptt, dump_buf, true); + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_mcp_trace_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_mcp_trace_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Perform dump */ + return qed_mcp_trace_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_reg_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_reg_fifo_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_reg_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_igu_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_igu_fifo_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_igu_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_protection_override_dump(p_hwfn, + p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_protection_override_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_protection_override_dump(p_hwfn, + p_ptt, + dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + *buf_size = qed_fw_asserts_dump(p_hwfn, p_ptt, NULL, false); + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_fw_asserts_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + *num_dumped_dwords = qed_fw_asserts_dump(p_hwfn, p_ptt, dump_buf, true); + return DBG_STATUS_OK; +} + +/******************************* Data Types **********************************/ + +struct mcp_trace_format { + u32 data; +#define MCP_TRACE_FORMAT_MODULE_MASK 0x0000ffff +#define MCP_TRACE_FORMAT_MODULE_SHIFT 0 +#define MCP_TRACE_FORMAT_LEVEL_MASK 0x00030000 +#define MCP_TRACE_FORMAT_LEVEL_SHIFT 16 +#define MCP_TRACE_FORMAT_P1_SIZE_MASK 0x000c0000 +#define MCP_TRACE_FORMAT_P1_SIZE_SHIFT 18 +#define MCP_TRACE_FORMAT_P2_SIZE_MASK 0x00300000 +#define MCP_TRACE_FORMAT_P2_SIZE_SHIFT 20 +#define MCP_TRACE_FORMAT_P3_SIZE_MASK 0x00c00000 +#define MCP_TRACE_FORMAT_P3_SIZE_SHIFT 22 +#define MCP_TRACE_FORMAT_LEN_MASK 0xff000000 +#define MCP_TRACE_FORMAT_LEN_SHIFT 24 + char *format_str; +}; + +struct mcp_trace_meta { + u32 modules_num; + char **modules; + u32 formats_num; + struct mcp_trace_format *formats; +}; + +/* Reg fifo element */ +struct reg_fifo_element { + u64 data; +#define REG_FIFO_ELEMENT_ADDRESS_SHIFT 0 +#define REG_FIFO_ELEMENT_ADDRESS_MASK 0x7fffff +#define REG_FIFO_ELEMENT_ACCESS_SHIFT 23 +#define REG_FIFO_ELEMENT_ACCESS_MASK 0x1 +#define REG_FIFO_ELEMENT_PF_SHIFT 24 +#define REG_FIFO_ELEMENT_PF_MASK 0xf +#define REG_FIFO_ELEMENT_VF_SHIFT 28 +#define REG_FIFO_ELEMENT_VF_MASK 0xff +#define REG_FIFO_ELEMENT_PORT_SHIFT 36 +#define REG_FIFO_ELEMENT_PORT_MASK 0x3 +#define REG_FIFO_ELEMENT_PRIVILEGE_SHIFT 38 +#define REG_FIFO_ELEMENT_PRIVILEGE_MASK 0x3 +#define REG_FIFO_ELEMENT_PROTECTION_SHIFT 40 +#define REG_FIFO_ELEMENT_PROTECTION_MASK 0x7 +#define REG_FIFO_ELEMENT_MASTER_SHIFT 43 +#define REG_FIFO_ELEMENT_MASTER_MASK 0xf +#define REG_FIFO_ELEMENT_ERROR_SHIFT 47 +#define REG_FIFO_ELEMENT_ERROR_MASK 0x1f +}; + +/* IGU fifo element */ +struct igu_fifo_element { + u32 dword0; +#define IGU_FIFO_ELEMENT_DWORD0_FID_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD0_FID_MASK 0xff +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_SHIFT 8 +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_SHIFT 9 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_SHIFT 13 +#define IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_SHIFT 17 +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_MASK 0x7fff + u32 dword1; + u32 dword2; +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_SHIFT 1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_MASK 0xffffffff + u32 reserved; +}; + +struct igu_fifo_wr_data { + u32 data; +#define IGU_FIFO_WR_DATA_PROD_CONS_SHIFT 0 +#define IGU_FIFO_WR_DATA_PROD_CONS_MASK 0xffffff +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_SHIFT 24 +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_MASK 0x1 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_SHIFT 25 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_MASK 0x3 +#define IGU_FIFO_WR_DATA_SEGMENT_SHIFT 27 +#define IGU_FIFO_WR_DATA_SEGMENT_MASK 0x1 +#define IGU_FIFO_WR_DATA_TIMER_MASK_SHIFT 28 +#define IGU_FIFO_WR_DATA_TIMER_MASK_MASK 0x1 +#define IGU_FIFO_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +struct igu_fifo_cleanup_wr_data { + u32 data; +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_SHIFT 0 +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_MASK 0x7ffffff +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_SHIFT 27 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_MASK 0x1 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_SHIFT 28 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_MASK 0x7 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +/* Protection override element */ +struct protection_override_element { + u64 data; +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_SHIFT 0 +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_MASK 0x7fffff +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_SHIFT 23 +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_MASK 0xffffff +#define PROTECTION_OVERRIDE_ELEMENT_READ_SHIFT 47 +#define PROTECTION_OVERRIDE_ELEMENT_READ_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_SHIFT 48 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_SHIFT 49 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_MASK 0x7 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_SHIFT 52 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_MASK 0x7 +}; + +enum igu_fifo_sources { + IGU_SRC_PXP0, + IGU_SRC_PXP1, + IGU_SRC_PXP2, + IGU_SRC_PXP3, + IGU_SRC_PXP4, + IGU_SRC_PXP5, + IGU_SRC_PXP6, + IGU_SRC_PXP7, + IGU_SRC_CAU, + IGU_SRC_ATTN, + IGU_SRC_GRC +}; + +enum igu_fifo_addr_types { + IGU_ADDR_TYPE_MSIX_MEM, + IGU_ADDR_TYPE_WRITE_PBA, + IGU_ADDR_TYPE_WRITE_INT_ACK, + IGU_ADDR_TYPE_WRITE_ATTN_BITS, + IGU_ADDR_TYPE_READ_INT, + IGU_ADDR_TYPE_WRITE_PROD_UPDATE, + IGU_ADDR_TYPE_RESERVED +}; + +struct igu_fifo_addr_data { + u16 start_addr; + u16 end_addr; + char *desc; + char *vf_desc; + enum igu_fifo_addr_types type; +}; + +/******************************** Constants **********************************/ + +#define MAX_MSG_LEN 1024 +#define MCP_TRACE_MAX_MODULE_LEN 8 +#define MCP_TRACE_FORMAT_MAX_PARAMS 3 +#define MCP_TRACE_FORMAT_PARAM_WIDTH \ + (MCP_TRACE_FORMAT_P2_SIZE_SHIFT - MCP_TRACE_FORMAT_P1_SIZE_SHIFT) +#define REG_FIFO_ELEMENT_ADDR_FACTOR 4 +#define REG_FIFO_ELEMENT_IS_PF_VF_VAL 127 +#define PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR 4 + +/********************************* Macros ************************************/ + +#define BYTES_TO_DWORDS(bytes) ((bytes) / BYTES_IN_DWORD) + +/***************************** Constant Arrays *******************************/ + +/* Status string array */ +static const char * const s_status_str[] = { + "Operation completed successfully", + "Debug application version wasn't set", + "Unsupported debug application version", + "The debug block wasn't reset since the last recording", + "Invalid arguments", + "The debug output was already set", + "Invalid PCI buffer size", + "PCI buffer allocation failed", + "A PCI buffer wasn't allocated", + "Too many inputs were enabled. Enabled less inputs, or set 'unifyInputs' to true", + "GRC/Timestamp input overlap in cycle dword 0", + "Cannot record Storm data since the entire recording cycle is used by HW", + "The Storm was already enabled", + "The specified Storm wasn't enabled", + "The block was already enabled", + "The specified block wasn't enabled", + "No input was enabled for recording", + "Filters and triggers are not allowed when recording in 64b units", + "The filter was already enabled", + "The trigger was already enabled", + "The trigger wasn't enabled", + "A constraint can be added only after a filter was enabled or a trigger state was added", + "Cannot add more than 3 trigger states", + "Cannot add more than 4 constraints per filter or trigger state", + "The recording wasn't started", + "A trigger was configured, but it didn't trigger", + "No data was recorded", + "Dump buffer is too small", + "Dumped data is not aligned to chunks", + "Unknown chip", + "Failed allocating virtual memory", + "The input block is in reset", + "Invalid MCP trace signature found in NVRAM", + "Invalid bundle ID found in NVRAM", + "Failed getting NVRAM image", + "NVRAM image is not dword-aligned", + "Failed reading from NVRAM", + "Idle check parsing failed", + "MCP Trace data is corrupt", + "Dump doesn't contain meta data - it must be provided in an image file", + "Failed to halt MCP", + "Failed to resume MCP after halt", + "DMAE transaction failed", + "Failed to empty SEMI sync FIFO", + "IGU FIFO data is corrupt", + "MCP failed to mask parities", + "FW Asserts parsing failed", + "GRC FIFO data is corrupt", + "Protection Override data is corrupt", + "Debug arrays were not set (when using binary files, dbg_set_bin_ptr must be called)", + "When a block is filtered, no other blocks can be recorded unless inputs are unified (due to a HW bug)" +}; + +/* Idle check severity names array */ +static const char * const s_idle_chk_severity_str[] = { + "Error", + "Error if no traffic", + "Warning" +}; + +/* MCP Trace level names array */ +static const char * const s_mcp_trace_level_str[] = { + "ERROR", + "TRACE", + "DEBUG" +}; + +/* Parsing strings */ +static const char * const s_access_strs[] = { + "read", + "write" +}; + +static const char * const s_privilege_strs[] = { + "VF", + "PDA", + "HV", + "UA" +}; + +static const char * const s_protection_strs[] = { + "(default)", + "(default)", + "(default)", + "(default)", + "override VF", + "override PDA", + "override HV", + "override UA" +}; + +static const char * const s_master_strs[] = { + "???", + "pxp", + "mcp", + "msdm", + "psdm", + "ysdm", + "usdm", + "tsdm", + "xsdm", + "dbu", + "dmae", + "???", + "???", + "???", + "???", + "???" +}; + +static const char * const s_reg_fifo_error_strs[] = { + "grc timeout", + "address doesn't belong to any block", + "reserved address in block or write to read-only address", + "privilege/protection mismatch", + "path isolation error" +}; + +static const char * const s_igu_fifo_source_strs[] = { + "TSTORM", + "MSTORM", + "USTORM", + "XSTORM", + "YSTORM", + "PSTORM", + "PCIE", + "NIG_QM_PBF", + "CAU", + "ATTN", + "GRC", +}; + +static const char * const s_igu_fifo_error_strs[] = { + "no error", + "length error", + "function disabled", + "VF sent command to attnetion address", + "host sent prod update command", + "read of during interrupt register while in MIMD mode", + "access to PXP BAR reserved address", + "producer update command to attention index", + "unknown error", + "SB index not valid", + "SB relative index and FID not found", + "FID not match", + "command with error flag asserted (PCI error or CAU discard)", + "VF sent cleanup and RF cleanup is disabled", + "cleanup command on type bigger than 4" +}; + +/* IGU FIFO address data */ +static const struct igu_fifo_addr_data s_igu_fifo_addr_data[] = { + {0x0, 0x101, "MSI-X Memory", NULL, IGU_ADDR_TYPE_MSIX_MEM}, + {0x102, 0x1ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x200, 0x200, "Write PBA[0:63]", NULL, IGU_ADDR_TYPE_WRITE_PBA}, + {0x201, 0x201, "Write PBA[64:127]", "reserved", + IGU_ADDR_TYPE_WRITE_PBA}, + {0x202, 0x202, "Write PBA[128]", "reserved", IGU_ADDR_TYPE_WRITE_PBA}, + {0x203, 0x3ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x400, 0x5ef, "Write interrupt acknowledgment", NULL, + IGU_ADDR_TYPE_WRITE_INT_ACK}, + {0x5f0, 0x5f0, "Attention bits update", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f1, 0x5f1, "Attention bits set", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f2, 0x5f2, "Attention bits clear", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f3, 0x5f3, "Read interrupt 0:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f4, 0x5f4, "Read interrupt 0:31 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f5, 0x5f5, "Read interrupt 32:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f6, 0x5f6, "Read interrupt 0:63 without mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f7, 0x5ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x600, 0x7ff, "Producer update", NULL, IGU_ADDR_TYPE_WRITE_PROD_UPDATE} +}; + +/******************************** Variables **********************************/ + +/* MCP Trace meta data - used in case the dump doesn't contain the meta data + * (e.g. due to no NVRAM access). + */ +static struct dbg_array s_mcp_trace_meta = { NULL, 0 }; + +/* Temporary buffer, used for print size calculations */ +static char s_temp_buf[MAX_MSG_LEN]; + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr) +{ + /* Convert binary data to debug arrays */ + u32 num_of_buffers = *(u32 *)bin_ptr; + struct bin_buffer_hdr *buf_array; + u8 buf_id; + + buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); + + for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + s_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +static u32 qed_cyclic_add(u32 a, u32 b, u32 size) +{ + return (a + b) % size; +} + +static u32 qed_cyclic_sub(u32 a, u32 b, u32 size) +{ + return (size + a - b) % size; +} + +/* Reads the specified number of bytes from the specified cyclic buffer (up to 4 + * bytes) and returns them as a dword value. the specified buffer offset is + * updated. + */ +static u32 qed_read_from_cyclic_buf(void *buf, + u32 *offset, + u32 buf_size, u8 num_bytes_to_read) +{ + u8 *bytes_buf = (u8 *)buf; + u8 *val_ptr; + u32 val = 0; + u8 i; + + val_ptr = (u8 *)&val; + + for (i = 0; i < num_bytes_to_read; i++) { + val_ptr[i] = bytes_buf[*offset]; + *offset = qed_cyclic_add(*offset, 1, buf_size); + } + + return val; +} + +/* Reads and returns the next byte from the specified buffer. + * The specified buffer offset is updated. + */ +static u8 qed_read_byte_from_buf(void *buf, u32 *offset) +{ + return ((u8 *)buf)[(*offset)++]; +} + +/* Reads and returns the next dword from the specified buffer. + * The specified buffer offset is updated. + */ +static u32 qed_read_dword_from_buf(void *buf, u32 *offset) +{ + u32 dword_val = *(u32 *)&((u8 *)buf)[*offset]; + + *offset += 4; + return dword_val; +} + +/* Reads the next string from the specified buffer, and copies it to the + * specified pointer. The specified buffer offset is updated. + */ +static void qed_read_str_from_buf(void *buf, u32 *offset, u32 size, char *dest) +{ + const char *source_str = &((const char *)buf)[*offset]; + + strncpy(dest, source_str, size); + dest[size - 1] = '\0'; + *offset += size; +} + +/* Returns a pointer to the specified offset (in bytes) of the specified buffer. + * If the specified buffer in NULL, a temporary buffer pointer is returned. + */ +static char *qed_get_buf_ptr(void *buf, u32 offset) +{ + return buf ? (char *)buf + offset : s_temp_buf; +} + +/* Reads a param from the specified buffer. Returns the number of dwords read. + * If the returned str_param is NULL, the param is numeric and its value is + * returned in num_param. + * Otheriwise, the param is a string and its pointer is returned in str_param. + */ +static u32 qed_read_param(u32 *dump_buf, + const char **param_name, + const char **param_str_val, u32 *param_num_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; /* In bytes */ + + /* Extract param name */ + *param_name = char_buf; + offset += strlen(*param_name) + 1; + + /* Check param type */ + if (*(char_buf + offset++)) { + /* String param */ + *param_str_val = char_buf + offset; + offset += strlen(*param_str_val) + 1; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + } else { + /* Numeric param */ + *param_str_val = NULL; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + *param_num_val = *(u32 *)(char_buf + offset); + offset += 4; + } + + return offset / 4; +} + +/* Reads a section header from the specified buffer. + * Returns the number of dwords read. + */ +static u32 qed_read_section_hdr(u32 *dump_buf, + const char **section_name, + u32 *num_section_params) +{ + const char *param_str_val; + + return qed_read_param(dump_buf, + section_name, ¶m_str_val, num_section_params); +} + +/* Reads section params from the specified buffer and prints them to the results + * buffer. Returns the number of dwords read. + */ +static u32 qed_print_section_params(u32 *dump_buf, + u32 num_section_params, + char *results_buf, u32 *num_chars_printed) +{ + u32 i, dump_offset = 0, results_offset = 0; + + for (i = 0; i < num_section_params; i++) { + const char *param_name; + const char *param_str_val; + u32 param_num_val = 0; + + dump_offset += qed_read_param(dump_buf + dump_offset, + ¶m_name, + ¶m_str_val, ¶m_num_val); + if (param_str_val) + /* String param */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %s\n", param_name, param_str_val); + else if (strcmp(param_name, "fw-timestamp")) + /* Numeric param */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %d\n", param_name, param_num_val); + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + *num_chars_printed = results_offset; + return dump_offset; +} + +const char *qed_dbg_get_status_str(enum dbg_status status) +{ + return (status < + MAX_DBG_STATUS) ? s_status_str[status] : "Invalid debug status"; +} + +/* Parses the idle check rules and returns the number of characters printed. + * In case of parsing error, returns 0. + */ +static u32 qed_parse_idle_chk_dump_rules(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 *dump_buf_end, + u32 num_rules, + bool print_fw_idle_chk, + char *results_buf, + u32 *num_errors, u32 *num_warnings) +{ + u32 rule_idx, results_offset = 0; /* Offset in results_buf in bytes */ + u16 i, j; + + *num_errors = 0; + *num_warnings = 0; + + /* Go over dumped results */ + for (rule_idx = 0; rule_idx < num_rules && dump_buf < dump_buf_end; + rule_idx++) { + const struct dbg_idle_chk_rule_parsing_data *rule_parsing_data; + struct dbg_idle_chk_result_hdr *hdr; + const char *parsing_str; + u32 parsing_str_offset; + const char *lsi_msg; + u8 curr_reg_id = 0; + bool has_fw_msg; + + hdr = (struct dbg_idle_chk_result_hdr *)dump_buf; + rule_parsing_data = + (const struct dbg_idle_chk_rule_parsing_data *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA]. + ptr[hdr->rule_id]; + parsing_str_offset = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET); + has_fw_msg = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG) > 0; + parsing_str = &((const char *) + s_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr) + [parsing_str_offset]; + lsi_msg = parsing_str; + + if (hdr->severity >= MAX_DBG_IDLE_CHK_SEVERITY_TYPES) + return 0; + + /* Skip rule header */ + dump_buf += (sizeof(struct dbg_idle_chk_result_hdr) / 4); + + /* Update errors/warnings count */ + if (hdr->severity == IDLE_CHK_SEVERITY_ERROR || + hdr->severity == IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC) + (*num_errors)++; + else + (*num_warnings)++; + + /* Print rule severity */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s: ", + s_idle_chk_severity_str[hdr->severity]); + + /* Print rule message */ + if (has_fw_msg) + parsing_str += strlen(parsing_str) + 1; + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s.", + has_fw_msg && + print_fw_idle_chk ? parsing_str : lsi_msg); + parsing_str += strlen(parsing_str) + 1; + + /* Print register values */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " Registers:"); + for (i = 0; + i < hdr->num_dumped_cond_regs + hdr->num_dumped_info_regs; + i++) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr + = (struct dbg_idle_chk_result_reg_hdr *) + dump_buf; + bool is_mem = + GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM); + u8 reg_id = + GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID); + + /* Skip reg header */ + dump_buf += + (sizeof(struct dbg_idle_chk_result_reg_hdr) / 4); + + /* Skip register names until the required reg_id is + * reached. + */ + for (; reg_id > curr_reg_id; + curr_reg_id++, + parsing_str += strlen(parsing_str) + 1); + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " %s", + parsing_str); + if (i < hdr->num_dumped_cond_regs && is_mem) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "[%d]", hdr->mem_entry_id + + reg_hdr->start_entry); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "="); + for (j = 0; j < reg_hdr->size; j++, dump_buf++) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "0x%x", *dump_buf); + if (j < reg_hdr->size - 1) + results_offset += + sprintf(qed_get_buf_ptr + (results_buf, + results_offset), ","); + } + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + /* Check if end of dump buffer was exceeded */ + if (dump_buf > dump_buf_end) + return 0; + return results_offset; +} + +/* Parses an idle check dump buffer. + * If result_buf is not NULL, the idle check results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_idle_chk_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes, + u32 *num_errors, + u32 *num_warnings) +{ + const char *section_name, *param_name, *param_str_val; + u32 *dump_buf_end = dump_buf + num_dumped_dwords; + u32 num_section_params = 0, num_rules; + u32 results_offset = 0; /* Offset in results_buf in bytes */ + + *parsed_results_bytes = 0; + *num_errors = 0; + *num_warnings = 0; + if (!s_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read idle_chk section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "idle_chk") || num_section_params != 1) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, &num_rules); + if (strcmp(param_name, "num_rules") != 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + if (num_rules) { + u32 rules_print_size; + + /* Print FW output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "FW_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(p_hwfn, dump_buf, + dump_buf_end, num_rules, + true, + results_buf ? + results_buf + + results_offset : NULL, + num_errors, num_warnings); + results_offset += rules_print_size; + if (rules_print_size == 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print LSI output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nLSI_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(p_hwfn, dump_buf, + dump_buf_end, num_rules, + false, + results_buf ? + results_buf + + results_offset : NULL, + num_errors, num_warnings); + results_offset += rules_print_size; + if (rules_print_size == 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + } + + /* Print errors/warnings count */ + if (*num_errors) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check failed!!! (with %d errors and %d warnings)\n", + *num_errors, *num_warnings); + } else if (*num_warnings) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfuly (with %d warnings)\n", + *num_warnings); + } else { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfuly\n"); + } + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + u32 num_errors, num_warnings; + + return qed_parse_idle_chk_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, + results_buf_size, + &num_errors, &num_warnings); +} + +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, u32 *num_warnings) +{ + u32 parsed_buf_size; + + return qed_parse_idle_chk_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, + &parsed_buf_size, + num_errors, num_warnings); +} + +/* Frees the specified MCP Trace meta data */ +static void qed_mcp_trace_free_meta(struct qed_hwfn *p_hwfn, + struct mcp_trace_meta *meta) +{ + u32 i; + + /* Release modules */ + if (meta->modules) { + for (i = 0; i < meta->modules_num; i++) + kfree(meta->modules[i]); + kfree(meta->modules); + } + + /* Release formats */ + if (meta->formats) { + for (i = 0; i < meta->formats_num; i++) + kfree(meta->formats[i].format_str); + kfree(meta->formats); + } +} + +/* Allocates and fills MCP Trace meta data based on the specified meta data + * dump buffer. + * Returns debug status code. + */ +static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn, + const u32 *meta_buf, + struct mcp_trace_meta *meta) +{ + u8 *meta_buf_bytes = (u8 *)meta_buf; + u32 offset = 0, signature, i; + + memset(meta, 0, sizeof(*meta)); + + /* Read first signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read number of modules and allocate memory for all the modules + * pointers. + */ + meta->modules_num = qed_read_byte_from_buf(meta_buf_bytes, &offset); + meta->modules = kzalloc(meta->modules_num * sizeof(char *), GFP_KERNEL); + if (!meta->modules) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all module strings */ + for (i = 0; i < meta->modules_num; i++) { + u8 module_len = qed_read_byte_from_buf(meta_buf_bytes, &offset); + + *(meta->modules + i) = kzalloc(module_len, GFP_KERNEL); + if (!(*(meta->modules + i))) { + /* Update number of modules to be released */ + meta->modules_num = i ? i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, &offset, module_len, + *(meta->modules + i)); + if (module_len > MCP_TRACE_MAX_MODULE_LEN) + (*(meta->modules + i))[MCP_TRACE_MAX_MODULE_LEN] = '\0'; + } + + /* Read second signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read number of formats and allocate memory for all formats */ + meta->formats_num = qed_read_dword_from_buf(meta_buf_bytes, &offset); + meta->formats = kzalloc(meta->formats_num * + sizeof(struct mcp_trace_format), + GFP_KERNEL); + if (!meta->formats) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all strings */ + for (i = 0; i < meta->formats_num; i++) { + struct mcp_trace_format *format_ptr = &meta->formats[i]; + u8 format_len; + + format_ptr->data = qed_read_dword_from_buf(meta_buf_bytes, + &offset); + format_len = + (format_ptr->data & + MCP_TRACE_FORMAT_LEN_MASK) >> MCP_TRACE_FORMAT_LEN_SHIFT; + format_ptr->format_str = kzalloc(format_len, GFP_KERNEL); + if (!format_ptr->format_str) { + /* Update number of modules to be released */ + meta->formats_num = i ? i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, + &offset, + format_len, format_ptr->format_str); + } + + return DBG_STATUS_OK; +} + +/* Parses an MCP Trace dump buffer. + * If result_buf is not NULL, the MCP Trace results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_mask, param_shift, param_num_val; + u32 num_section_params, offset, end_offset, bytes_left; + const char *section_name, *param_name, *param_str_val; + u32 trace_data_dwords, trace_meta_dwords; + struct mcp_trace_meta meta; + struct mcp_trace *trace; + enum dbg_status status; + const u32 *meta_buf; + u8 *trace_buf; + + *parsed_results_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read trace_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_data") || num_section_params != 1) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_data_dwords = param_num_val; + + /* Prepare trace info */ + trace = (struct mcp_trace *)dump_buf; + trace_buf = (u8 *)dump_buf + sizeof(struct mcp_trace); + offset = trace->trace_oldest; + end_offset = trace->trace_prod; + bytes_left = qed_cyclic_sub(end_offset, offset, trace->size); + dump_buf += trace_data_dwords; + + /* Read meta_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_meta")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size") != 0) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_meta_dwords = param_num_val; + + /* Choose meta data buffer */ + if (!trace_meta_dwords) { + /* Dump doesn't include meta data */ + if (!s_mcp_trace_meta.ptr) + return DBG_STATUS_MCP_TRACE_NO_META; + meta_buf = s_mcp_trace_meta.ptr; + } else { + /* Dump includes meta data */ + meta_buf = dump_buf; + } + + /* Allocate meta data memory */ + status = qed_mcp_trace_alloc_meta(p_hwfn, meta_buf, &meta); + if (status != DBG_STATUS_OK) + goto free_mem; + + /* Ignore the level and modules masks - just print everything that is + * already in the buffer. + */ + while (bytes_left) { + struct mcp_trace_format *format_ptr; + u8 format_level, format_module; + u32 params[3] = { 0, 0, 0 }; + u32 header, format_idx, i; + + if (bytes_left < MFW_TRACE_ENTRY_SIZE) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + header = qed_read_from_cyclic_buf(trace_buf, + &offset, + trace->size, + MFW_TRACE_ENTRY_SIZE); + bytes_left -= MFW_TRACE_ENTRY_SIZE; + format_idx = header & MFW_TRACE_EVENTID_MASK; + + /* Skip message if its index doesn't exist in the meta data */ + if (format_idx > meta.formats_num) { + u8 format_size = + (u8)((header & + MFW_TRACE_PRM_SIZE_MASK) >> + MFW_TRACE_PRM_SIZE_SHIFT); + + if (bytes_left < format_size) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + offset = qed_cyclic_add(offset, + format_size, trace->size); + bytes_left -= format_size; + continue; + } + + format_ptr = &meta.formats[format_idx]; + for (i = 0, + param_mask = MCP_TRACE_FORMAT_P1_SIZE_MASK, param_shift = + MCP_TRACE_FORMAT_P1_SIZE_SHIFT; + i < MCP_TRACE_FORMAT_MAX_PARAMS; + i++, param_mask <<= MCP_TRACE_FORMAT_PARAM_WIDTH, + param_shift += MCP_TRACE_FORMAT_PARAM_WIDTH) { + /* Extract param size (0..3) */ + u8 param_size = + (u8)((format_ptr->data & + param_mask) >> param_shift); + + /* If the param size is zero, there are no other + * parameters. + */ + if (!param_size) + break; + + /* Size is encoded using 2 bits, where 3 is used to + * encode 4. + */ + if (param_size == 3) + param_size = 4; + if (bytes_left < param_size) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + params[i] = qed_read_from_cyclic_buf(trace_buf, + &offset, + trace->size, + param_size); + bytes_left -= param_size; + } + + format_level = + (u8)((format_ptr->data & + MCP_TRACE_FORMAT_LEVEL_MASK) >> + MCP_TRACE_FORMAT_LEVEL_SHIFT); + format_module = + (u8)((format_ptr->data & + MCP_TRACE_FORMAT_MODULE_MASK) >> + MCP_TRACE_FORMAT_MODULE_SHIFT); + if (format_level >= ARRAY_SIZE(s_mcp_trace_level_str)) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + /* Print current message to results buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s %-8s: ", + s_mcp_trace_level_str[format_level], + meta.modules[format_module]); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + format_ptr->format_str, params[0], params[1], + params[2]); + } + +free_mem: + *parsed_results_bytes = results_offset + 1; + qed_mcp_trace_free_meta(p_hwfn, &meta); + return status; +} + +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Parses a Reg FIFO dump buffer. + * If result_buf is not NULL, the Reg FIFO results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_reg_fifo_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct reg_fifo_element *elements; + u8 i, j, err_val, vf_val; + char vf_str[4]; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read reg_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "reg_fifo_data")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + if (param_num_val % REG_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_REG_FIFO_BAD_DATA; + num_elements = param_num_val / REG_FIFO_ELEMENT_DWORDS; + elements = (struct reg_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + bool err_printed = false; + + /* Discover if element belongs to a VF or a PF */ + vf_val = GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_VF); + if (vf_val == REG_FIFO_ELEMENT_IS_PF_VF_VAL) + sprintf(vf_str, "%s", "N/A"); + else + sprintf(vf_str, "%d", vf_val); + + /* Add parsed element to parsed buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "raw: 0x%016llx, address: 0x%07llx, access: %-5s, pf: %2lld, vf: %s, port: %lld, privilege: %-3s, protection: %-12s, master: %-4s, errors: ", + elements[i].data, + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ADDRESS) * + REG_FIFO_ELEMENT_ADDR_FACTOR, + s_access_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ACCESS)], + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PF), vf_str, + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PORT), + s_privilege_strs[GET_FIELD(elements[i]. + data, + REG_FIFO_ELEMENT_PRIVILEGE)], + s_protection_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PROTECTION)], + s_master_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_MASTER)]); + + /* Print errors */ + for (j = 0, + err_val = GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ERROR); + j < ARRAY_SIZE(s_reg_fifo_error_strs); + j++, err_val >>= 1) { + if (!(err_val & 0x1)) + continue; + if (err_printed) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + ", "); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s", + s_reg_fifo_error_strs[j]); + err_printed = true; + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_reg_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_reg_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Parses an IGU FIFO dump buffer. + * If result_buf is not NULL, the IGU FIFO results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_igu_fifo_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct igu_fifo_element *elements; + char parsed_addr_data[32]; + char parsed_wr_data[256]; + u8 i, j; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read igu_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "igu_fifo_data")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (param_num_val % IGU_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + num_elements = param_num_val / IGU_FIFO_ELEMENT_DWORDS; + elements = (struct igu_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + /* dword12 (dword index 1 and 2) contains bits 32..95 of the + * FIFO element. + */ + u64 dword12 = + ((u64)elements[i].dword2 << 32) | elements[i].dword1; + bool is_wr_cmd = GET_FIELD(dword12, + IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD); + bool is_pf = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_IS_PF); + u16 cmd_addr = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR); + u8 source = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_SOURCE); + u8 err_type = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE); + const struct igu_fifo_addr_data *addr_data = NULL; + + if (source >= ARRAY_SIZE(s_igu_fifo_source_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (err_type >= ARRAY_SIZE(s_igu_fifo_error_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Find address data */ + for (j = 0; j < ARRAY_SIZE(s_igu_fifo_addr_data) && !addr_data; + j++) + if (cmd_addr >= s_igu_fifo_addr_data[j].start_addr && + cmd_addr <= s_igu_fifo_addr_data[j].end_addr) + addr_data = &s_igu_fifo_addr_data[j]; + if (!addr_data) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Prepare parsed address data */ + switch (addr_data->type) { + case IGU_ADDR_TYPE_MSIX_MEM: + sprintf(parsed_addr_data, + " vector_num=0x%x", cmd_addr / 2); + break; + case IGU_ADDR_TYPE_WRITE_INT_ACK: + case IGU_ADDR_TYPE_WRITE_PROD_UPDATE: + sprintf(parsed_addr_data, + " SB=0x%x", cmd_addr - addr_data->start_addr); + break; + default: + parsed_addr_data[0] = '\0'; + } + + /* Prepare parsed write data */ + if (is_wr_cmd) { + u32 wr_data = GET_FIELD(dword12, + IGU_FIFO_ELEMENT_DWORD12_WR_DATA); + u32 prod_cons = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_PROD_CONS); + u8 is_cleanup = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_CMD_TYPE); + + if (source == IGU_SRC_ATTN) { + sprintf(parsed_wr_data, + "prod: 0x%x, ", prod_cons); + } else { + if (is_cleanup) { + u8 cleanup_val = GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL); + u8 cleanup_type = GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE); + + sprintf(parsed_wr_data, + "cmd_type: cleanup, cleanup_val: %s, cleanup_type: %d, ", + cleanup_val ? "set" : "clear", + cleanup_type); + } else { + u8 update_flag = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_UPDATE_FLAG); + u8 en_dis_int_for_sb = + GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB); + u8 segment = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_SEGMENT); + u8 timer_mask = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_TIMER_MASK); + + sprintf(parsed_wr_data, + "cmd_type: prod/cons update, prod/cons: 0x%x, update_flag: %s, en_dis_int_for_sb: %s, segment: %s, timer_mask=%d, ", + prod_cons, + update_flag ? "update" : "nop", + en_dis_int_for_sb + ? (en_dis_int_for_sb == + 1 ? "disable" : "nop") : + "enable", + segment ? "attn" : "regular", + timer_mask); + } + } + } else { + parsed_wr_data[0] = '\0'; + } + + /* Add parsed element to parsed buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "raw: 0x%01x%08x%08x, %s: %d, source: %s, type: %s, cmd_addr: 0x%x (%s%s), %serror: %s\n", + elements[i].dword2, elements[i].dword1, + elements[i].dword0, + is_pf ? "pf" : "vf", + GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_FID), + s_igu_fifo_source_strs[source], + is_wr_cmd ? "wr" : "rd", cmd_addr, + (!is_pf && addr_data->vf_desc) + ? addr_data->vf_desc : addr_data->desc, + parsed_addr_data, parsed_wr_data, + s_igu_fifo_error_strs[err_type]); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_igu_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_igu_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +static enum dbg_status +qed_parse_protection_override_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct protection_override_element *elements; + u8 i; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read protection_override_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "protection_override_data")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + if (param_num_val % PROTECTION_OVERRIDE_ELEMENT_DWORDS != 0) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + num_elements = param_num_val / PROTECTION_OVERRIDE_ELEMENT_DWORDS; + elements = (struct protection_override_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + u32 address = GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_ADDRESS) * + PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR; + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "window %2d, address: 0x%07x, size: %7lld regs, read: %lld, write: %lld, read protection: %-12s, write protection: %-12s\n", + i, address, + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE), + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ), + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE), + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION)], + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION)]); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "protection override contained %d elements", + num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_protection_override_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_protection_override_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, + &parsed_buf_size); +} + +/* Parses a FW Asserts dump buffer. + * If result_buf is not NULL, the FW Asserts results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_fw_asserts_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, num_section_params, param_num_val, i; + const char *param_name, *param_str_val, *section_name; + bool last_section_found = false; + + *parsed_results_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + while (!last_section_found) { + const char *storm_letter = NULL; + u32 storm_dump_size = 0; + + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, + &num_section_params); + if (!strcmp(section_name, "last")) { + last_section_found = true; + continue; + } else if (strcmp(section_name, "fw_asserts")) { + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + + /* Extract params */ + for (i = 0; i < num_section_params; i++) { + dump_buf += qed_read_param(dump_buf, + ¶m_name, + ¶m_str_val, + ¶m_num_val); + if (!strcmp(param_name, "storm")) + storm_letter = param_str_val; + else if (!strcmp(param_name, "size")) + storm_dump_size = param_num_val; + else + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + + if (!storm_letter || !storm_dump_size) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print data */ + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\n%sSTORM_ASSERT: size=%d\n", + storm_letter, storm_dump_size); + for (i = 0; i < storm_dump_size; i++, dump_buf++) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%08x\n", *dump_buf); + } + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_fw_asserts_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_fw_asserts_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Wrapper for unifying the idle_chk and mcp_trace api */ +enum dbg_status qed_print_idle_chk_results_wrapper(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 num_errors, num_warnnings; + + return qed_print_idle_chk_results(p_hwfn, dump_buf, num_dumped_dwords, + results_buf, &num_errors, + &num_warnnings); +} + +/* Feature meta data lookup table */ +static struct { + char *name; + enum dbg_status (*get_size)(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *size); + enum dbg_status (*perform_dump)(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + u32 buf_size, u32 *dumped_dwords); + enum dbg_status (*print_results)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, u32 num_dumped_dwords, + char *results_buf); + enum dbg_status (*results_buf_size)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +} qed_features_lookup[] = { + { + "grc", qed_dbg_grc_get_dump_buf_size, + qed_dbg_grc_dump, NULL, NULL}, { + "idle_chk", + qed_dbg_idle_chk_get_dump_buf_size, + qed_dbg_idle_chk_dump, + qed_print_idle_chk_results_wrapper, + qed_get_idle_chk_results_buf_size}, { + "mcp_trace", + qed_dbg_mcp_trace_get_dump_buf_size, + qed_dbg_mcp_trace_dump, qed_print_mcp_trace_results, + qed_get_mcp_trace_results_buf_size}, { + "reg_fifo", + qed_dbg_reg_fifo_get_dump_buf_size, + qed_dbg_reg_fifo_dump, qed_print_reg_fifo_results, + qed_get_reg_fifo_results_buf_size}, { + "igu_fifo", + qed_dbg_igu_fifo_get_dump_buf_size, + qed_dbg_igu_fifo_dump, qed_print_igu_fifo_results, + qed_get_igu_fifo_results_buf_size}, { + "protection_override", + qed_dbg_protection_override_get_dump_buf_size, + qed_dbg_protection_override_dump, + qed_print_protection_override_results, + qed_get_protection_override_results_buf_size}, { + "fw_asserts", + qed_dbg_fw_asserts_get_dump_buf_size, + qed_dbg_fw_asserts_dump, + qed_print_fw_asserts_results, + qed_get_fw_asserts_results_buf_size},}; + +static void qed_dbg_print_feature(u8 *p_text_buf, u32 text_size) +{ + u32 i, precision = 80; + + if (!p_text_buf) + return; + + pr_notice("\n%.*s", precision, p_text_buf); + for (i = precision; i < text_size; i += precision) + pr_cont("%.*s", precision, p_text_buf + i); + pr_cont("\n"); +} + +#define QED_RESULTS_BUF_MIN_SIZE 16 +/* Generic function for decoding debug feature info */ +enum dbg_status format_feature(struct qed_hwfn *p_hwfn, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 text_size_bytes, null_char_pos, i; + enum dbg_status rc; + char *text_buf; + + /* Check if feature supports formatting capability */ + if (!qed_features_lookup[feature_idx].results_buf_size) + return DBG_STATUS_OK; + + /* Obtain size of formatted output */ + rc = qed_features_lookup[feature_idx]. + results_buf_size(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, &text_size_bytes); + if (rc != DBG_STATUS_OK) + return rc; + + /* Make sure that the allocated size is a multiple of dword (4 bytes) */ + null_char_pos = text_size_bytes - 1; + text_size_bytes = (text_size_bytes + 3) & ~0x3; + + if (text_size_bytes < QED_RESULTS_BUF_MIN_SIZE) { + DP_NOTICE(p_hwfn->cdev, + "formatted size of feature was too small %d. Aborting\n", + text_size_bytes); + return DBG_STATUS_INVALID_ARGS; + } + + /* Allocate temp text buf */ + text_buf = vzalloc(text_size_bytes); + if (!text_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Decode feature opcodes to string on temp buf */ + rc = qed_features_lookup[feature_idx]. + print_results(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, text_buf); + if (rc != DBG_STATUS_OK) { + vfree(text_buf); + return rc; + } + + /* Replace the original null character with a '\n' character. + * The bytes that were added as a result of the dword alignment are also + * padded with '\n' characters. + */ + for (i = null_char_pos; i < text_size_bytes; i++) + text_buf[i] = '\n'; + + /* Dump printable feature to log */ + if (p_hwfn->cdev->dbg_params.print_data) + qed_dbg_print_feature(text_buf, text_size_bytes); + + /* Free the old dump_buf and point the dump_buf to the newly allocagted + * and formatted text buffer. + */ + vfree(feature->dump_buf); + feature->dump_buf = text_buf; + feature->buf_size = text_size_bytes; + feature->dumped_dwords = text_size_bytes / 4; + return rc; +} + +/* Generic function for performing the dump of a debug feature. */ +enum dbg_status qed_dbg_dump(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 buf_size_dwords; + enum dbg_status rc; + + DP_NOTICE(p_hwfn->cdev, "Collecting a debug feature [\"%s\"]\n", + qed_features_lookup[feature_idx].name); + + /* Dump_buf was already allocated need to free (this can happen if dump + * was called but file was never read). + * We can't use the buffer as is since size may have changed. + */ + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + + /* Get buffer size from hsi, allocate accordingly, and perform the + * dump. + */ + rc = qed_features_lookup[feature_idx].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK) + return rc; + feature->buf_size = buf_size_dwords * sizeof(u32); + feature->dump_buf = vmalloc(feature->buf_size); + if (!feature->dump_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + rc = qed_features_lookup[feature_idx]. + perform_dump(p_hwfn, p_ptt, (u32 *)feature->dump_buf, + feature->buf_size / sizeof(u32), + &feature->dumped_dwords); + + /* If mcp is stuck we get DBG_STATUS_NVRAM_GET_IMAGE_FAILED error. + * In this case the buffer holds valid binary data, but we wont able + * to parse it (since parsing relies on data in NVRAM which is only + * accessible when MFW is responsive). skip the formatting but return + * success so that binary data is provided. + */ + if (rc == DBG_STATUS_NVRAM_GET_IMAGE_FAILED) + return DBG_STATUS_OK; + + if (rc != DBG_STATUS_OK) + return rc; + + /* Format output */ + rc = format_feature(p_hwfn, feature_idx); + return rc; +} + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_GRC, num_dumped_bytes); +} + +int qed_dbg_grc_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_GRC); +} + +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IDLE_CHK, + num_dumped_bytes); +} + +int qed_dbg_idle_chk_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IDLE_CHK); +} + +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_REG_FIFO, + num_dumped_bytes); +} + +int qed_dbg_reg_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_REG_FIFO); +} + +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IGU_FIFO, + num_dumped_bytes); +} + +int qed_dbg_igu_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO); +} + +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_PROTECTION_OVERRIDE, + num_dumped_bytes); +} + +int qed_dbg_protection_override_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_PROTECTION_OVERRIDE); +} + +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_FW_ASSERTS, + num_dumped_bytes); +} + +int qed_dbg_fw_asserts_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_FW_ASSERTS); +} + +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_MCP_TRACE, + num_dumped_bytes); +} + +int qed_dbg_mcp_trace_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_MCP_TRACE); +} + +/* Defines the amount of bytes allocated for recording the length of debugfs + * feature buffer. + */ +#define REGDUMP_HEADER_SIZE sizeof(u32) +#define REGDUMP_HEADER_FEATURE_SHIFT 24 +#define REGDUMP_HEADER_ENGINE_SHIFT 31 +#define REGDUMP_HEADER_OMIT_ENGINE_SHIFT 30 +enum debug_print_features { + OLD_MODE = 0, + IDLE_CHK = 1, + GRC_DUMP = 2, + MCP_TRACE = 3, + REG_FIFO = 4, + PROTECTION_OVERRIDE = 5, + IGU_FIFO = 6, + PHY = 7, + FW_ASSERTS = 8, +}; + +static u32 qed_calc_regdump_header(enum debug_print_features feature, + int engine, u32 feature_size, u8 omit_engine) +{ + /* Insert the engine, feature and mode inside the header and combine it + * with feature size. + */ + return feature_size | (feature << REGDUMP_HEADER_FEATURE_SHIFT) | + (omit_engine << REGDUMP_HEADER_OMIT_ENGINE_SHIFT) | + (engine << REGDUMP_HEADER_ENGINE_SHIFT); +} + +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) +{ + u8 cur_engine, omit_engine = 0, org_engine; + u32 offset = 0, feature_size; + int rc; + + if (cdev->num_hwfns == 1) + omit_engine = 1; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Collect idle_chks and grcDump for each hw function */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "obtaining idle_chk and grcdump for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + + /* First idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* Second idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* reg_fifo dump */ + rc = qed_dbg_reg_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(REG_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_reg_fifo failed. rc = %d\n", rc); + } + + /* igu_fifo dump */ + rc = qed_dbg_igu_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IGU_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_igu_fifo failed. rc = %d", rc); + } + + /* protection_override dump */ + rc = qed_dbg_protection_override(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, + &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(PROTECTION_OVERRIDE, + cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, + "qed_dbg_protection_override failed. rc = %d\n", + rc); + } + + /* fw_asserts dump */ + rc = qed_dbg_fw_asserts(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(FW_ASSERTS, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_fw_asserts failed. rc = %d\n", + rc); + } + + /* GRC dump - must be last because when mcp stuck it will + * clutter idle_chk, reg_fifo, ... + */ + rc = qed_dbg_grc(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(GRC_DUMP, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_grc failed. rc = %d", rc); + } + } + + /* mcp_trace */ + rc = qed_dbg_mcp_trace(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(MCP_TRACE, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_mcp_trace failed. rc = %d\n", rc); + } + + qed_set_debug_engine(cdev, org_engine); + + return 0; +} + +int qed_dbg_all_data_size(struct qed_dev *cdev) +{ + u8 cur_engine, org_engine; + u32 regs_len = 0; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Engine specific */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "calculating idle_chk and grcdump register length for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_grc_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_reg_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_igu_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + + qed_dbg_protection_override_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_fw_asserts_size(cdev); + } + + /* Engine common */ + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev); + qed_set_debug_engine(cdev, org_engine); + + return regs_len; +} + +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + enum dbg_status dbg_rc; + struct qed_ptt *p_ptt; + int rc = 0; + + /* Acquire ptt */ + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EINVAL; + + /* Get dump */ + dbg_rc = qed_dbg_dump(p_hwfn, p_ptt, feature); + if (dbg_rc != DBG_STATUS_OK) { + DP_VERBOSE(cdev, QED_MSG_DEBUG, "%s\n", + qed_dbg_get_status_str(dbg_rc)); + *num_dumped_bytes = 0; + rc = -EINVAL; + goto out; + } + + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "copying debugfs feature to external buffer\n"); + memcpy(buffer, qed_feature->dump_buf, qed_feature->buf_size); + *num_dumped_bytes = cdev->dbg_params.features[feature].dumped_dwords * + 4; + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn); + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + u32 buf_size_dwords; + enum dbg_status rc; + + if (!p_ptt) + return -EINVAL; + + rc = qed_features_lookup[feature].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK) + buf_size_dwords = 0; + + qed_ptt_release(p_hwfn, p_ptt); + qed_feature->buf_size = buf_size_dwords * sizeof(u32); + return qed_feature->buf_size; +} + +u8 qed_get_debug_engine(struct qed_dev *cdev) +{ + return cdev->dbg_params.engine_for_debug; +} + +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number) +{ + DP_VERBOSE(cdev, QED_MSG_DEBUG, "set debug engine to %d\n", + engine_number); + cdev->dbg_params.engine_for_debug = engine_number; +} + +void qed_dbg_pf_init(struct qed_dev *cdev) +{ + const u8 *dbg_values; + + /* Debug values are after init values. + * The offset is the first dword of the file. + */ + dbg_values = cdev->firmware->data + *(u32 *)cdev->firmware->data; + qed_dbg_set_bin_ptr((u8 *)dbg_values); + qed_dbg_user_set_bin_ptr((u8 *)dbg_values); +} + +void qed_dbg_pf_exit(struct qed_dev *cdev) +{ + struct qed_dbg_feature *feature = NULL; + enum qed_dbg_features feature_idx; + + /* Debug features' buffers may be allocated if debug feature was used + * but dump wasn't called. + */ + for (feature_idx = 0; feature_idx < DBG_FEATURE_NUM; feature_idx++) { + feature = &cdev->dbg_params.features[feature_idx]; + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + } +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.h b/drivers/net/ethernet/qlogic/qed/qed_debug.h new file mode 100644 index 000000000000..f872d7324814 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.h @@ -0,0 +1,54 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QED_DEBUGFS_H +#define _QED_DEBUGFS_H + +enum qed_dbg_features { + DBG_FEATURE_GRC, + DBG_FEATURE_IDLE_CHK, + DBG_FEATURE_MCP_TRACE, + DBG_FEATURE_REG_FIFO, + DBG_FEATURE_IGU_FIFO, + DBG_FEATURE_PROTECTION_OVERRIDE, + DBG_FEATURE_FW_ASSERTS, + DBG_FEATURE_NUM +}; + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); +int qed_dbg_grc_size(struct qed_dev *cdev); +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_idle_chk_size(struct qed_dev *cdev); +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_reg_fifo_size(struct qed_dev *cdev); +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_igu_fifo_size(struct qed_dev *cdev); +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_protection_override_size(struct qed_dev *cdev); +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_fw_asserts_size(struct qed_dev *cdev); +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_mcp_trace_size(struct qed_dev *cdev); +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer); +int qed_dbg_all_data_size(struct qed_dev *cdev); +u8 qed_get_debug_engine(struct qed_dev *cdev); +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number); +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes); +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature); + +void qed_dbg_pf_init(struct qed_dev *cdev); +void qed_dbg_pf_exit(struct qed_dev *cdev); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 855691f18412..2777d5bb4380 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -1728,13 +1728,6 @@ enum bin_dbg_buffer_type { MAX_BIN_DBG_BUFFER_TYPE }; -/* Chip IDs */ -enum chip_ids { - CHIP_RESERVED, - CHIP_BB_B0, - CHIP_RESERVED2, - MAX_CHIP_IDS -}; /* Attention bit mapping */ struct dbg_attn_bit_mapping { @@ -1813,6 +1806,371 @@ enum dbg_attn_type { MAX_DBG_ATTN_TYPE }; +/* condition header for registers dump */ +struct dbg_dump_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + u8 block_id; /* block ID */ + u8 data_size; /* size in dwords of the data following this header */ +}; + +/* memory data for registers dump */ +struct dbg_dump_mem { + __le32 dword0; +#define DBG_DUMP_MEM_ADDRESS_MASK 0xFFFFFF +#define DBG_DUMP_MEM_ADDRESS_SHIFT 0 +#define DBG_DUMP_MEM_MEM_GROUP_ID_MASK 0xFF +#define DBG_DUMP_MEM_MEM_GROUP_ID_SHIFT 24 + __le32 dword1; +#define DBG_DUMP_MEM_LENGTH_MASK 0xFFFFFF +#define DBG_DUMP_MEM_LENGTH_SHIFT 0 +#define DBG_DUMP_MEM_RESERVED_MASK 0xFF +#define DBG_DUMP_MEM_RESERVED_SHIFT 24 +}; + +/* register data for registers dump */ +struct dbg_dump_reg { + __le32 data; +#define DBG_DUMP_REG_ADDRESS_MASK 0xFFFFFF /* register address (in dwords) */ +#define DBG_DUMP_REG_ADDRESS_SHIFT 0 +#define DBG_DUMP_REG_LENGTH_MASK 0xFF /* register size (in dwords) */ +#define DBG_DUMP_REG_LENGTH_SHIFT 24 +}; + +/* split header for registers dump */ +struct dbg_dump_split_hdr { + __le32 hdr; +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_MASK 0xFFFFFF +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_SHIFT 0 +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_MASK 0xFF +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_SHIFT 24 +}; + +/* condition header for idle check */ +struct dbg_idle_chk_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + __le16 data_size; /* size in dwords of the data following this header */ +}; + +/* Idle Check condition register */ +struct dbg_idle_chk_cond_reg { + __le32 data; +#define DBG_IDLE_CHK_COND_REG_ADDRESS_MASK 0xFFFFFF +#define DBG_IDLE_CHK_COND_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_SHIFT 24 + __le16 num_entries; /* number of registers entries to check */ + u8 entry_size; /* size of registers entry (in dwords) */ + u8 start_entry; /* index of the first entry to check */ +}; + +/* Idle Check info register */ +struct dbg_idle_chk_info_reg { + __le32 data; +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_MASK 0xFFFFFF +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_SHIFT 24 + __le16 size; /* register size in dwords */ + struct dbg_mode_hdr mode; /* Mode header */ +}; + +/* Idle Check register */ +union dbg_idle_chk_reg { + struct dbg_idle_chk_cond_reg cond_reg; /* condition register */ + struct dbg_idle_chk_info_reg info_reg; /* info register */ +}; + +/* Idle Check result header */ +struct dbg_idle_chk_result_hdr { + __le16 rule_id; /* Failing rule index */ + __le16 mem_entry_id; /* Failing memory entry index */ + u8 num_dumped_cond_regs; /* number of dumped condition registers */ + u8 num_dumped_info_regs; /* number of dumped condition registers */ + u8 severity; /* from dbg_idle_chk_severity_types enum */ + u8 reserved; +}; + +/* Idle Check result register header */ +struct dbg_idle_chk_result_reg_hdr { + u8 data; +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_MASK 0x1 +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_SHIFT 0 +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_MASK 0x7F +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_SHIFT 1 + u8 start_entry; /* index of the first checked entry */ + __le16 size; /* register size in dwords */ +}; + +/* Idle Check rule */ +struct dbg_idle_chk_rule { + __le16 rule_id; /* Idle Check rule ID */ + u8 severity; /* value from dbg_idle_chk_severity_types enum */ + u8 cond_id; /* Condition ID */ + u8 num_cond_regs; /* number of condition registers */ + u8 num_info_regs; /* number of info registers */ + u8 num_imms; /* number of immediates in the condition */ + u8 reserved1; + __le16 reg_offset; /* offset of this rules registers in the idle check + * register array (in dbg_idle_chk_reg units). + */ + __le16 imm_offset; /* offset of this rules immediate values in the + * immediate values array (in dwords). + */ +}; + +/* Idle Check rule parsing data */ +struct dbg_idle_chk_rule_parsing_data { + __le32 data; +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_MASK 0x1 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_SHIFT 0 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_MASK 0x7FFFFFFF +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_SHIFT 1 +}; + +/* idle check severity types */ +enum dbg_idle_chk_severity_types { + /* idle check failure should cause an error */ + IDLE_CHK_SEVERITY_ERROR, + /* idle check failure should cause an error only if theres no traffic */ + IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC, + /* idle check failure should cause a warning */ + IDLE_CHK_SEVERITY_WARNING, + MAX_DBG_IDLE_CHK_SEVERITY_TYPES +}; + +/* Debug Bus block data */ +struct dbg_bus_block_data { + u8 enabled; /* Indicates if the block is enabled for recording (0/1) */ + u8 hw_id; /* HW ID associated with the block */ + u8 line_num; /* Debug line number to select */ + u8 right_shift; /* Number of units to right the debug data (0-3) */ + u8 cycle_en; /* 4-bit value: bit i set -> unit i is enabled. */ + u8 force_valid; /* 4-bit value: bit i set -> unit i is forced valid. */ + u8 force_frame; /* 4-bit value: bit i set -> unit i frame bit is forced. + */ + u8 reserved; +}; + +/* Debug Bus Clients */ +enum dbg_bus_clients { + DBG_BUS_CLIENT_RBCN, + DBG_BUS_CLIENT_RBCP, + DBG_BUS_CLIENT_RBCR, + DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCF, + DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCZ, + DBG_BUS_CLIENT_OTHER_ENGINE, + DBG_BUS_CLIENT_TIMESTAMP, + DBG_BUS_CLIENT_CPU, + DBG_BUS_CLIENT_RBCY, + DBG_BUS_CLIENT_RBCQ, + DBG_BUS_CLIENT_RBCM, + DBG_BUS_CLIENT_RBCB, + DBG_BUS_CLIENT_RBCW, + DBG_BUS_CLIENT_RBCV, + MAX_DBG_BUS_CLIENTS +}; + +/* Debug Bus memory address */ +struct dbg_bus_mem_addr { + __le32 lo; + __le32 hi; +}; + +/* Debug Bus PCI buffer data */ +struct dbg_bus_pci_buf_data { + struct dbg_bus_mem_addr phys_addr; /* PCI buffer physical address */ + struct dbg_bus_mem_addr virt_addr; /* PCI buffer virtual address */ + __le32 size; /* PCI buffer size in bytes */ +}; + +/* Debug Bus Storm EID range filter params */ +struct dbg_bus_storm_eid_range_params { + u8 min; /* Minimal event ID to filter on */ + u8 max; /* Maximal event ID to filter on */ +}; + +/* Debug Bus Storm EID mask filter params */ +struct dbg_bus_storm_eid_mask_params { + u8 val; /* Event ID value */ + u8 mask; /* Event ID mask. 1s in the mask = dont care bits. */ +}; + +/* Debug Bus Storm EID filter params */ +union dbg_bus_storm_eid_params { + struct dbg_bus_storm_eid_range_params range; + struct dbg_bus_storm_eid_mask_params mask; +}; + +/* Debug Bus Storm data */ +struct dbg_bus_storm_data { + u8 fast_enabled; + u8 fast_mode; + u8 slow_enabled; + u8 slow_mode; + u8 hw_id; + u8 eid_filter_en; + u8 eid_range_not_mask; + u8 cid_filter_en; + union dbg_bus_storm_eid_params eid_filter_params; + __le16 reserved; + __le32 cid; +}; + +/* Debug Bus data */ +struct dbg_bus_data { + __le32 app_version; /* The tools version number of the application */ + u8 state; /* The current debug bus state */ + u8 hw_dwords; /* HW dwords per cycle */ + u8 next_hw_id; /* Next HW ID to be associated with an input */ + u8 num_enabled_blocks; /* Number of blocks enabled for recording */ + u8 num_enabled_storms; /* Number of Storms enabled for recording */ + u8 target; /* Output target */ + u8 next_trigger_state; /* ID of next trigger state to be added */ + u8 next_constraint_id; /* ID of next filter/trigger constraint to be + * added. + */ + u8 one_shot_en; /* Indicates if one-shot mode is enabled (0/1) */ + u8 grc_input_en; /* Indicates if GRC recording is enabled (0/1) */ + u8 timestamp_input_en; /* Indicates if timestamp recording is enabled + * (0/1). + */ + u8 filter_en; /* Indicates if the recording filter is enabled (0/1) */ + u8 trigger_en; /* Indicates if the recording trigger is enabled (0/1) */ + u8 adding_filter; /* If true, the next added constraint belong to the + * filter. Otherwise, it belongs to the last added + * trigger state. Valid only if either filter or + * triggers are enabled. + */ + u8 filter_pre_trigger; /* Indicates if the recording filter should be + * applied before the trigger. Valid only if both + * filter and trigger are enabled (0/1). + */ + u8 filter_post_trigger; /* Indicates if the recording filter should be + * applied after the trigger. Valid only if both + * filter and trigger are enabled (0/1). + */ + u8 unify_inputs; /* If true, all inputs are associated with HW ID 0. + * Otherwise, each input is assigned a different HW ID + * (0/1). + */ + u8 rcv_from_other_engine; /* Indicates if the other engine sends it NW + * recording to this engine (0/1). + */ + struct dbg_bus_pci_buf_data pci_buf; /* Debug Bus PCI buffer data. Valid + * only when the target is + * DBG_BUS_TARGET_ID_PCI. + */ + __le16 reserved; + struct dbg_bus_block_data blocks[80];/* Debug Bus data for each block */ + struct dbg_bus_storm_data storms[6]; /* Debug Bus data for each block */ +}; + +/* Debug bus frame modes */ +enum dbg_bus_frame_modes { + DBG_BUS_FRAME_MODE_0HW_4ST = 0, /* 0 HW dwords, 4 Storm dwords */ + DBG_BUS_FRAME_MODE_4HW_0ST = 3, /* 4 HW dwords, 0 Storm dwords */ + DBG_BUS_FRAME_MODE_8HW_0ST = 4, /* 8 HW dwords, 0 Storm dwords */ + MAX_DBG_BUS_FRAME_MODES +}; + +/* Debug bus states */ +enum dbg_bus_states { + DBG_BUS_STATE_IDLE, /* debug bus idle state (not recording) */ + DBG_BUS_STATE_READY, /* debug bus is ready for configuration and + * recording. + */ + DBG_BUS_STATE_RECORDING, /* debug bus is currently recording */ + DBG_BUS_STATE_STOPPED, /* debug bus recording has stopped */ + MAX_DBG_BUS_STATES +}; + +/* Debug bus target IDs */ +enum dbg_bus_targets { + /* records debug bus to DBG block internal buffer */ + DBG_BUS_TARGET_ID_INT_BUF, + /* records debug bus to the NW */ + DBG_BUS_TARGET_ID_NIG, + /* records debug bus to a PCI buffer */ + DBG_BUS_TARGET_ID_PCI, + MAX_DBG_BUS_TARGETS +}; + +/* GRC Dump data */ +struct dbg_grc_data { + __le32 param_val[40]; /* Value of each GRC parameter. Array size must + * match the enum dbg_grc_params. + */ + u8 param_set_by_user[40]; /* Indicates for each GRC parameter if it was + * set by the user (0/1). Array size must + * match the enum dbg_grc_params. + */ +}; + +/* Debug GRC params */ +enum dbg_grc_params { + DBG_GRC_PARAM_DUMP_TSTORM, /* dump Tstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_MSTORM, /* dump Mstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_USTORM, /* dump Ustorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_XSTORM, /* dump Xstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_YSTORM, /* dump Ystorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_PSTORM, /* dump Pstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_REGS, /* dump non-memory registers (0/1) */ + DBG_GRC_PARAM_DUMP_RAM, /* dump Storm internal RAMs (0/1) */ + DBG_GRC_PARAM_DUMP_PBUF, /* dump Storm passive buffer (0/1) */ + DBG_GRC_PARAM_DUMP_IOR, /* dump Storm IORs (0/1) */ + DBG_GRC_PARAM_DUMP_VFC, /* dump VFC memories (0/1) */ + DBG_GRC_PARAM_DUMP_CM_CTX, /* dump CM contexts (0/1) */ + DBG_GRC_PARAM_DUMP_PXP, /* dump PXP memories (0/1) */ + DBG_GRC_PARAM_DUMP_RSS, /* dump RSS memories (0/1) */ + DBG_GRC_PARAM_DUMP_CAU, /* dump CAU memories (0/1) */ + DBG_GRC_PARAM_DUMP_QM, /* dump QM memories (0/1) */ + DBG_GRC_PARAM_DUMP_MCP, /* dump MCP memories (0/1) */ + DBG_GRC_PARAM_RESERVED, /* reserved */ + DBG_GRC_PARAM_DUMP_CFC, /* dump CFC memories (0/1) */ + DBG_GRC_PARAM_DUMP_IGU, /* dump IGU memories (0/1) */ + DBG_GRC_PARAM_DUMP_BRB, /* dump BRB memories (0/1) */ + DBG_GRC_PARAM_DUMP_BTB, /* dump BTB memories (0/1) */ + DBG_GRC_PARAM_DUMP_BMB, /* dump BMB memories (0/1) */ + DBG_GRC_PARAM_DUMP_NIG, /* dump NIG memories (0/1) */ + DBG_GRC_PARAM_DUMP_MULD, /* dump MULD memories (0/1) */ + DBG_GRC_PARAM_DUMP_PRS, /* dump PRS memories (0/1) */ + DBG_GRC_PARAM_DUMP_DMAE, /* dump PRS memories (0/1) */ + DBG_GRC_PARAM_DUMP_TM, /* dump TM (timers) memories (0/1) */ + DBG_GRC_PARAM_DUMP_SDM, /* dump SDM memories (0/1) */ + DBG_GRC_PARAM_DUMP_DIF, /* dump DIF memories (0/1) */ + DBG_GRC_PARAM_DUMP_STATIC, /* dump static debug data (0/1) */ + DBG_GRC_PARAM_UNSTALL, /* un-stall Storms after dump (0/1) */ + DBG_GRC_PARAM_NUM_LCIDS, /* number of LCIDs (0..320) */ + DBG_GRC_PARAM_NUM_LTIDS, /* number of LTIDs (0..320) */ + /* preset: exclude all memories from dump (1 only) */ + DBG_GRC_PARAM_EXCLUDE_ALL, + /* preset: include memories for crash dump (1 only) */ + DBG_GRC_PARAM_CRASH, + /* perform dump only if MFW is responding (0/1) */ + DBG_GRC_PARAM_PARITY_SAFE, + DBG_GRC_PARAM_DUMP_CM, /* dump CM memories (0/1) */ + DBG_GRC_PARAM_DUMP_PHY, /* dump PHY memories (0/1) */ + MAX_DBG_GRC_PARAMS +}; + +/* Debug reset registers */ +enum dbg_reset_regs { + DBG_RESET_REG_MISCS_PL_UA, + DBG_RESET_REG_MISCS_PL_HV, + DBG_RESET_REG_MISCS_PL_HV_2, + DBG_RESET_REG_MISC_PL_UA, + DBG_RESET_REG_MISC_PL_HV, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + DBG_RESET_REG_MISC_PL_PDA_VAUX, + MAX_DBG_RESET_REGS +}; + /* Debug status codes */ enum dbg_status { DBG_STATUS_OK, @@ -1869,6 +2227,41 @@ enum dbg_status { MAX_DBG_STATUS }; +/* Debug Storms IDs */ +enum dbg_storms { + DBG_TSTORM_ID, + DBG_MSTORM_ID, + DBG_USTORM_ID, + DBG_XSTORM_ID, + DBG_YSTORM_ID, + DBG_PSTORM_ID, + MAX_DBG_STORMS +}; + +/* Idle Check data */ +struct idle_chk_data { + __le32 buf_size; /* Idle check buffer size in dwords */ + u8 buf_size_set; /* Indicates if the idle check buffer size was set + * (0/1). + */ + u8 reserved1; + __le16 reserved2; +}; + +/* Debug Tools data (per HW function) */ +struct dbg_tools_data { + struct dbg_grc_data grc; /* GRC Dump data */ + struct dbg_bus_data bus; /* Debug Bus data */ + struct idle_chk_data idle_chk; /* Idle Check data */ + u8 mode_enable[40]; /* Indicates if a mode is enabled (0/1) */ + u8 block_in_reset[80]; /* Indicates if a block is in reset state (0/1). + */ + u8 chip_id; /* Chip ID (from enum chip_ids) */ + u8 platform_id; /* Platform ID (from enum platform_ids) */ + u8 initialized; /* Indicates if the data was initialized */ + u8 reserved; +}; + /********************************/ /* HSI Init Functions constants */ /********************************/ @@ -1948,15 +2341,50 @@ struct init_qm_vport_params { /* Max size in dwords of a zipped array */ #define MAX_ZIPPED_SIZE 8192 +struct fw_asserts_ram_section { + __le16 section_ram_line_offset; + __le16 section_ram_line_size; + u8 list_dword_offset; + u8 list_element_dword_size; + u8 list_num_elements; + u8 list_next_index_dword_offset; +}; + +struct fw_ver_num { + u8 major; /* Firmware major version number */ + u8 minor; /* Firmware minor version number */ + u8 rev; /* Firmware revision version number */ + u8 eng; /* Firmware engineering version number (for bootleg versions) */ +}; + +struct fw_ver_info { + __le16 tools_ver; /* Tools version number */ + u8 image_id; /* FW image ID (e.g. main) */ + u8 reserved1; + struct fw_ver_num num; /* FW version number */ + __le32 timestamp; /* FW Timestamp in unix time (sec. since 1970) */ + __le32 reserved2; +}; + +struct fw_info { + struct fw_ver_info ver; + struct fw_asserts_ram_section fw_asserts_section; +}; + +struct fw_info_location { + __le32 grc_addr; + __le32 size; +}; + enum init_modes { MODE_RESERVED, MODE_BB_B0, - MODE_RESERVED2, + MODE_K2, MODE_ASIC, + MODE_RESERVED2, MODE_RESERVED3, MODE_RESERVED4, MODE_RESERVED5, - MODE_RESERVED6, MODE_SF, MODE_MF_SD, MODE_MF_SI, @@ -1965,7 +2393,7 @@ enum init_modes { MODE_PORTS_PER_ENG_4, MODE_100G, MODE_40G, - MODE_RESERVED7, + MODE_RESERVED6, MAX_INIT_MODES }; @@ -2223,8 +2651,276 @@ struct iro { __le16 size; }; +/***************************** Public Functions *******************************/ +/** + * @brief qed_dbg_set_bin_ptr - Sets a pointer to the binary data with debug + * arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr); +/** + * @brief qed_dbg_grc_get_dump_buf_size - Returns the required buffer size for + * GRC Dump. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the GRC Dump + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_grc_dump - Dumps GRC data into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the collected GRC data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified dump buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_idle_chk_get_dump_buf_size - Returns the required buffer size + * for idle check results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the idle check + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_idle_chk_dump - Performs idle check and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the idle check data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_mcp_trace_get_dump_buf_size - Returns the required buffer size + * for mcp trace results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for mcp trace data. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_mcp_trace_dump - Performs mcp trace and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the mcp trace data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * - the trace meta data cannot be read (from NVRAM or image file) + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_reg_fifo_get_dump_buf_size - Returns the required buffer size + * for grc trace fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for reg fifo data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_reg_fifo_dump - Reads the reg fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the reg fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_igu_fifo_get_dump_buf_size - Returns the required buffer size + * for the IGU fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the IGU fifo + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_igu_fifo_dump - Reads the IGU fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the IGU fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_protection_override_get_dump_buf_size - Returns the required + * buffer size for protection override window results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for protection + * override data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_protection_override_dump - Reads protection override window + * entries and writes the results into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the protection override data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_fw_asserts_get_dump_buf_size - Returns the required buffer + * size for FW Asserts results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for FW Asserts data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_fw_asserts_dump - Reads the FW Asserts and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the FW Asserts data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); /** - * @brief qed_dbg_print_attn - Prints attention registers values in the specified results struct. + * @brief qed_dbg_print_attn - Prints attention registers values in the + * specified results struct. * * @param p_hwfn * @param results - Pointer to the attention read results @@ -2236,8 +2932,212 @@ struct iro { enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn, struct dbg_attn_block_result *results); +/******************************** Constants **********************************/ + #define MAX_NAME_LEN 16 +/***************************** Public Functions *******************************/ +/** + * @brief qed_dbg_user_set_bin_ptr - Sets a pointer to the binary data with + * debug arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr); +/** + * @brief qed_dbg_get_status_str - Returns a string for the specified status. + * + * @param status - a debug status code. + * + * @return a string for the specified status + */ +const char *qed_dbg_get_status_str(enum dbg_status status); +/** + * @brief qed_get_idle_chk_results_buf_size - Returns the required buffer size + * for idle check results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_idle_chk_results - Prints idle check results + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the idle check results. + * @param num_errors - OUT: number of errors found in idle check. + * @param num_warnings - OUT: number of warnings found in idle check. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, + u32 *num_warnings); +/** + * @brief qed_get_mcp_trace_results_buf_size - Returns the required buffer size + * for MCP Trace results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - MCP Trace dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_mcp_trace_results - Prints MCP Trace results + * + * @param p_hwfn - HW device data + * @param dump_buf - mcp trace dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the mcp trace results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_reg_fifo_results_buf_size - Returns the required buffer size + * for reg_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_reg_fifo_results - Prints reg fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the reg fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_igu_fifo_results_buf_size - Returns the required buffer size + * for igu_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_igu_fifo_results - Prints IGU fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the IGU fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_protection_override_results_buf_size - Returns the required + * buffer size for protection override results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_protection_override_results - Prints protection override + * results. + * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the reg fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_fw_asserts_results_buf_size - Returns the required buffer size + * for FW Asserts results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_fw_asserts_results - Prints FW Asserts results + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the FW Asserts results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); /* Win 2 */ #define GTT_BAR0_MAP_REG_IGU_CMD 0x00f000UL @@ -7039,6 +7939,35 @@ struct ystorm_iscsi_conn_ag_ctx { __le32 reg2; __le32 reg3; }; + +#define MFW_TRACE_SIGNATURE 0x25071946 + +/* The trace in the buffer */ +#define MFW_TRACE_EVENTID_MASK 0x00ffff +#define MFW_TRACE_PRM_SIZE_MASK 0x0f0000 +#define MFW_TRACE_PRM_SIZE_SHIFT 16 +#define MFW_TRACE_ENTRY_SIZE 3 + +struct mcp_trace { + u32 signature; /* Help to identify that the trace is valid */ + u32 size; /* the size of the trace buffer in bytes */ + u32 curr_level; /* 2 - all will be written to the buffer + * 1 - debug trace will not be written + * 0 - just errors will be written to the buffer + */ + u32 modules_mask[2]; /* a bit per module, 1 means write it, 0 means + * mask it. + */ + + /* Warning: the following pointers are assumed to be 32bits as they are + * used only in the MFW. + */ + u32 trace_prod; /* The next trace will be written to this offset */ + u32 trace_oldest; /* The oldest valid trace starts at this offset + * (usually very close after the current producer). + */ +}; + #define VF_MAX_STATIC 192 #define MCP_GLOB_PATH_MAX 2 @@ -7046,6 +7975,7 @@ struct ystorm_iscsi_conn_ag_ctx { #define MCP_GLOB_PORT_MAX 4 #define MCP_GLOB_FUNC_MAX 16 +typedef u32 offsize_t; /* In DWORDS !!! */ /* Offset from the beginning of the MCP scratchpad */ #define OFFSIZE_OFFSET_SHIFT 0 #define OFFSIZE_OFFSET_MASK 0x0000ffff @@ -7636,6 +8566,8 @@ struct public_drv_mb { #define DRV_MSG_CODE_NIG_DRAIN 0x30000000 #define DRV_MSG_CODE_VF_DISABLED_DONE 0xc0000000 #define DRV_MSG_CODE_CFG_VF_MSIX 0xc0010000 +#define DRV_MSG_CODE_NVM_GET_FILE_ATT 0x00030000 +#define DRV_MSG_CODE_NVM_READ_NVRAM 0x00050000 #define DRV_MSG_CODE_MCP_RESET 0x00090000 #define DRV_MSG_CODE_SET_VERSION 0x000f0000 #define DRV_MSG_CODE_MCP_HALT 0x00100000 @@ -7657,6 +8589,9 @@ struct public_drv_mb { #define DRV_MB_PARAM_UNLOAD_WOL_MCP 0x00000001 #define DRV_MB_PARAM_DCBX_NOTIFY_MASK 0x000000FF #define DRV_MB_PARAM_DCBX_NOTIFY_SHIFT 3 + +#define DRV_MB_PARAM_NVM_LEN_SHIFT 24 + #define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT 0 #define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK 0x000000FF #define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT 8 @@ -7694,6 +8629,8 @@ struct public_drv_mb { #define FW_MSG_CODE_DRV_UNLOAD_FUNCTION 0x20130000 #define FW_MSG_CODE_DRV_UNLOAD_DONE 0x21100000 #define FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE 0xb0010000 + +#define FW_MSG_CODE_NVM_OK 0x00010000 #define FW_MSG_CODE_OK 0x00160000 #define FW_MSG_SEQ_NUMBER_MASK 0x0000ffff @@ -7930,4 +8867,101 @@ struct nvm_cfg1 { struct nvm_cfg1_port port[MCP_GLOB_PORT_MAX]; struct nvm_cfg1_func func[MCP_GLOB_FUNC_MAX]; }; + +enum spad_sections { + SPAD_SECTION_TRACE, + SPAD_SECTION_NVM_CFG, + SPAD_SECTION_PUBLIC, + SPAD_SECTION_PRIVATE, + SPAD_SECTION_MAX +}; + +#define MCP_TRACE_SIZE 2048 /* 2kb */ + +/* This section is located at a fixed location in the beginning of the + * scratchpad, to ensure that the MCP trace is not run over during MFW upgrade. + * All the rest of data has a floating location which differs from version to + * version, and is pointed by the mcp_meta_data below. + * Moreover, the spad_layout section is part of the MFW firmware, and is loaded + * with it from nvram in order to clear this portion. + */ +struct static_init { + u32 num_sections; + offsize_t sections[SPAD_SECTION_MAX]; +#define SECTION(_sec_) (*((offsize_t *)(STRUCT_OFFSET(sections[_sec_])))) + + struct mcp_trace trace; +#define MCP_TRACE_P ((struct mcp_trace *)(STRUCT_OFFSET(trace))) + u8 trace_buffer[MCP_TRACE_SIZE]; +#define MCP_TRACE_BUF ((u8 *)(STRUCT_OFFSET(trace_buffer))) + /* running_mfw has the same definition as in nvm_map.h. + * This bit indicate both the running dir, and the running bundle. + * It is set once when the LIM is loaded. + */ + u32 running_mfw; +#define RUNNING_MFW (*((u32 *)(STRUCT_OFFSET(running_mfw)))) + u32 build_time; +#define MFW_BUILD_TIME (*((u32 *)(STRUCT_OFFSET(build_time)))) + u32 reset_type; +#define RESET_TYPE (*((u32 *)(STRUCT_OFFSET(reset_type)))) + u32 mfw_secure_mode; +#define MFW_SECURE_MODE (*((u32 *)(STRUCT_OFFSET(mfw_secure_mode)))) + u16 pme_status_pf_bitmap; +#define PME_STATUS_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_status_pf_bitmap)))) + u16 pme_enable_pf_bitmap; +#define PME_ENABLE_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_enable_pf_bitmap)))) + u32 mim_nvm_addr; + u32 mim_start_addr; + u32 ah_pcie_link_params; +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_MASK (0x000000ff) +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_SHIFT (0) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_MASK (0x0000ff00) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_SHIFT (8) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_MASK (0x00ff0000) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_SHIFT (16) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_MASK (0xff000000) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_SHIFT (24) +#define AH_PCIE_LINK_PARAMS (*((u32 *)(STRUCT_OFFSET(ah_pcie_link_params)))) + + u32 rsrv_persist[5]; /* Persist reserved for MFW upgrades */ +}; + +enum nvm_image_type { + NVM_TYPE_TIM1 = 0x01, + NVM_TYPE_TIM2 = 0x02, + NVM_TYPE_MIM1 = 0x03, + NVM_TYPE_MIM2 = 0x04, + NVM_TYPE_MBA = 0x05, + NVM_TYPE_MODULES_PN = 0x06, + NVM_TYPE_VPD = 0x07, + NVM_TYPE_MFW_TRACE1 = 0x08, + NVM_TYPE_MFW_TRACE2 = 0x09, + NVM_TYPE_NVM_CFG1 = 0x0a, + NVM_TYPE_L2B = 0x0b, + NVM_TYPE_DIR1 = 0x0c, + NVM_TYPE_EAGLE_FW1 = 0x0d, + NVM_TYPE_FALCON_FW1 = 0x0e, + NVM_TYPE_PCIE_FW1 = 0x0f, + NVM_TYPE_HW_SET = 0x10, + NVM_TYPE_LIM = 0x11, + NVM_TYPE_AVS_FW1 = 0x12, + NVM_TYPE_DIR2 = 0x13, + NVM_TYPE_CCM = 0x14, + NVM_TYPE_EAGLE_FW2 = 0x15, + NVM_TYPE_FALCON_FW2 = 0x16, + NVM_TYPE_PCIE_FW2 = 0x17, + NVM_TYPE_AVS_FW2 = 0x18, + NVM_TYPE_INIT_HW = 0x19, + NVM_TYPE_DEFAULT_CFG = 0x1a, + NVM_TYPE_MDUMP = 0x1b, + NVM_TYPE_META = 0x1c, + NVM_TYPE_ISCSI_CFG = 0x1d, + NVM_TYPE_FCOE_CFG = 0x1f, + NVM_TYPE_ETH_PHY_FW1 = 0x20, + NVM_TYPE_ETH_PHY_FW2 = 0x21, + NVM_TYPE_MAX, +}; + +#define DIR_ID_1 (0) + #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index d22e3f8816ae..250efd1d270d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -588,6 +588,8 @@ static int qed_nic_stop(struct qed_dev *cdev) } } + qed_dbg_pf_exit(cdev); + return rc; } @@ -846,6 +848,8 @@ static int qed_slowpath_start(struct qed_dev *cdev, /* First Dword used to diffrentiate between various sources */ data = cdev->firmware->data + sizeof(u32); + + qed_dbg_pf_init(cdev); } memset(&tunn_info, 0, sizeof(tunn_info)); diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index b44f09b18eb1..759cb04e02b0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -528,9 +528,903 @@ #define QM_REG_WFQPFWEIGHT 0x2f4e80UL #define QM_REG_WFQVPWEIGHT 0x2fa000UL +#define PGLCS_REG_DBG_SELECT \ + 0x001d14UL +#define PGLCS_REG_DBG_DWORD_ENABLE \ + 0x001d18UL +#define PGLCS_REG_DBG_SHIFT \ + 0x001d1cUL +#define PGLCS_REG_DBG_FORCE_VALID \ + 0x001d20UL +#define PGLCS_REG_DBG_FORCE_FRAME \ + 0x001d24UL +#define MISC_REG_RESET_PL_PDA_VMAIN_1 \ + 0x008070UL +#define MISC_REG_RESET_PL_PDA_VMAIN_2 \ + 0x008080UL +#define MISC_REG_RESET_PL_PDA_VAUX \ + 0x008090UL +#define MISCS_REG_RESET_PL_UA \ + 0x009050UL +#define MISCS_REG_RESET_PL_HV \ + 0x009060UL +#define MISCS_REG_RESET_PL_HV_2 \ + 0x009150UL +#define DMAE_REG_DBG_SELECT \ + 0x00c510UL +#define DMAE_REG_DBG_DWORD_ENABLE \ + 0x00c514UL +#define DMAE_REG_DBG_SHIFT \ + 0x00c518UL +#define DMAE_REG_DBG_FORCE_VALID \ + 0x00c51cUL +#define DMAE_REG_DBG_FORCE_FRAME \ + 0x00c520UL +#define NCSI_REG_DBG_SELECT \ + 0x040474UL +#define NCSI_REG_DBG_DWORD_ENABLE \ + 0x040478UL +#define NCSI_REG_DBG_SHIFT \ + 0x04047cUL +#define NCSI_REG_DBG_FORCE_VALID \ + 0x040480UL +#define NCSI_REG_DBG_FORCE_FRAME \ + 0x040484UL +#define GRC_REG_DBG_SELECT \ + 0x0500a4UL +#define GRC_REG_DBG_DWORD_ENABLE \ + 0x0500a8UL +#define GRC_REG_DBG_SHIFT \ + 0x0500acUL +#define GRC_REG_DBG_FORCE_VALID \ + 0x0500b0UL +#define GRC_REG_DBG_FORCE_FRAME \ + 0x0500b4UL +#define UMAC_REG_DBG_SELECT \ + 0x051094UL +#define UMAC_REG_DBG_DWORD_ENABLE \ + 0x051098UL +#define UMAC_REG_DBG_SHIFT \ + 0x05109cUL +#define UMAC_REG_DBG_FORCE_VALID \ + 0x0510a0UL +#define UMAC_REG_DBG_FORCE_FRAME \ + 0x0510a4UL +#define MCP2_REG_DBG_SELECT \ + 0x052400UL +#define MCP2_REG_DBG_DWORD_ENABLE \ + 0x052404UL +#define MCP2_REG_DBG_SHIFT \ + 0x052408UL +#define MCP2_REG_DBG_FORCE_VALID \ + 0x052440UL +#define MCP2_REG_DBG_FORCE_FRAME \ + 0x052444UL +#define PCIE_REG_DBG_SELECT \ + 0x0547e8UL +#define PCIE_REG_DBG_DWORD_ENABLE \ + 0x0547ecUL +#define PCIE_REG_DBG_SHIFT \ + 0x0547f0UL +#define PCIE_REG_DBG_FORCE_VALID \ + 0x0547f4UL +#define PCIE_REG_DBG_FORCE_FRAME \ + 0x0547f8UL +#define DORQ_REG_DBG_SELECT \ + 0x100ad0UL +#define DORQ_REG_DBG_DWORD_ENABLE \ + 0x100ad4UL +#define DORQ_REG_DBG_SHIFT \ + 0x100ad8UL +#define DORQ_REG_DBG_FORCE_VALID \ + 0x100adcUL +#define DORQ_REG_DBG_FORCE_FRAME \ + 0x100ae0UL +#define IGU_REG_DBG_SELECT \ + 0x181578UL +#define IGU_REG_DBG_DWORD_ENABLE \ + 0x18157cUL +#define IGU_REG_DBG_SHIFT \ + 0x181580UL +#define IGU_REG_DBG_FORCE_VALID \ + 0x181584UL +#define IGU_REG_DBG_FORCE_FRAME \ + 0x181588UL +#define CAU_REG_DBG_SELECT \ + 0x1c0ea8UL +#define CAU_REG_DBG_DWORD_ENABLE \ + 0x1c0eacUL +#define CAU_REG_DBG_SHIFT \ + 0x1c0eb0UL +#define CAU_REG_DBG_FORCE_VALID \ + 0x1c0eb4UL +#define CAU_REG_DBG_FORCE_FRAME \ + 0x1c0eb8UL +#define PRS_REG_DBG_SELECT \ + 0x1f0b6cUL +#define PRS_REG_DBG_DWORD_ENABLE \ + 0x1f0b70UL +#define PRS_REG_DBG_SHIFT \ + 0x1f0b74UL +#define PRS_REG_DBG_FORCE_VALID \ + 0x1f0ba0UL +#define PRS_REG_DBG_FORCE_FRAME \ + 0x1f0ba4UL +#define CNIG_REG_DBG_SELECT_K2 \ + 0x218254UL +#define CNIG_REG_DBG_DWORD_ENABLE_K2 \ + 0x218258UL +#define CNIG_REG_DBG_SHIFT_K2 \ + 0x21825cUL +#define CNIG_REG_DBG_FORCE_VALID_K2 \ + 0x218260UL +#define CNIG_REG_DBG_FORCE_FRAME_K2 \ + 0x218264UL +#define PRM_REG_DBG_SELECT \ + 0x2306a8UL +#define PRM_REG_DBG_DWORD_ENABLE \ + 0x2306acUL +#define PRM_REG_DBG_SHIFT \ + 0x2306b0UL +#define PRM_REG_DBG_FORCE_VALID \ + 0x2306b4UL +#define PRM_REG_DBG_FORCE_FRAME \ + 0x2306b8UL +#define SRC_REG_DBG_SELECT \ + 0x238700UL +#define SRC_REG_DBG_DWORD_ENABLE \ + 0x238704UL +#define SRC_REG_DBG_SHIFT \ + 0x238708UL +#define SRC_REG_DBG_FORCE_VALID \ + 0x23870cUL +#define SRC_REG_DBG_FORCE_FRAME \ + 0x238710UL +#define RSS_REG_DBG_SELECT \ + 0x238c4cUL +#define RSS_REG_DBG_DWORD_ENABLE \ + 0x238c50UL +#define RSS_REG_DBG_SHIFT \ + 0x238c54UL +#define RSS_REG_DBG_FORCE_VALID \ + 0x238c58UL +#define RSS_REG_DBG_FORCE_FRAME \ + 0x238c5cUL +#define RPB_REG_DBG_SELECT \ + 0x23c728UL +#define RPB_REG_DBG_DWORD_ENABLE \ + 0x23c72cUL +#define RPB_REG_DBG_SHIFT \ + 0x23c730UL +#define RPB_REG_DBG_FORCE_VALID \ + 0x23c734UL +#define RPB_REG_DBG_FORCE_FRAME \ + 0x23c738UL +#define PSWRQ2_REG_DBG_SELECT \ + 0x240100UL +#define PSWRQ2_REG_DBG_DWORD_ENABLE \ + 0x240104UL +#define PSWRQ2_REG_DBG_SHIFT \ + 0x240108UL +#define PSWRQ2_REG_DBG_FORCE_VALID \ + 0x24010cUL +#define PSWRQ2_REG_DBG_FORCE_FRAME \ + 0x240110UL +#define PSWRQ_REG_DBG_SELECT \ + 0x280020UL +#define PSWRQ_REG_DBG_DWORD_ENABLE \ + 0x280024UL +#define PSWRQ_REG_DBG_SHIFT \ + 0x280028UL +#define PSWRQ_REG_DBG_FORCE_VALID \ + 0x28002cUL +#define PSWRQ_REG_DBG_FORCE_FRAME \ + 0x280030UL +#define PSWWR_REG_DBG_SELECT \ + 0x29a084UL +#define PSWWR_REG_DBG_DWORD_ENABLE \ + 0x29a088UL +#define PSWWR_REG_DBG_SHIFT \ + 0x29a08cUL +#define PSWWR_REG_DBG_FORCE_VALID \ + 0x29a090UL +#define PSWWR_REG_DBG_FORCE_FRAME \ + 0x29a094UL +#define PSWRD_REG_DBG_SELECT \ + 0x29c040UL +#define PSWRD_REG_DBG_DWORD_ENABLE \ + 0x29c044UL +#define PSWRD_REG_DBG_SHIFT \ + 0x29c048UL +#define PSWRD_REG_DBG_FORCE_VALID \ + 0x29c04cUL +#define PSWRD_REG_DBG_FORCE_FRAME \ + 0x29c050UL +#define PSWRD2_REG_DBG_SELECT \ + 0x29d400UL +#define PSWRD2_REG_DBG_DWORD_ENABLE \ + 0x29d404UL +#define PSWRD2_REG_DBG_SHIFT \ + 0x29d408UL +#define PSWRD2_REG_DBG_FORCE_VALID \ + 0x29d40cUL +#define PSWRD2_REG_DBG_FORCE_FRAME \ + 0x29d410UL +#define PSWHST2_REG_DBG_SELECT \ + 0x29e058UL +#define PSWHST2_REG_DBG_DWORD_ENABLE \ + 0x29e05cUL +#define PSWHST2_REG_DBG_SHIFT \ + 0x29e060UL +#define PSWHST2_REG_DBG_FORCE_VALID \ + 0x29e064UL +#define PSWHST2_REG_DBG_FORCE_FRAME \ + 0x29e068UL +#define PSWHST_REG_DBG_SELECT \ + 0x2a0100UL +#define PSWHST_REG_DBG_DWORD_ENABLE \ + 0x2a0104UL +#define PSWHST_REG_DBG_SHIFT \ + 0x2a0108UL +#define PSWHST_REG_DBG_FORCE_VALID \ + 0x2a010cUL +#define PSWHST_REG_DBG_FORCE_FRAME \ + 0x2a0110UL +#define PGLUE_B_REG_DBG_SELECT \ + 0x2a8400UL +#define PGLUE_B_REG_DBG_DWORD_ENABLE \ + 0x2a8404UL +#define PGLUE_B_REG_DBG_SHIFT \ + 0x2a8408UL +#define PGLUE_B_REG_DBG_FORCE_VALID \ + 0x2a840cUL +#define PGLUE_B_REG_DBG_FORCE_FRAME \ + 0x2a8410UL +#define TM_REG_DBG_SELECT \ + 0x2c07a8UL +#define TM_REG_DBG_DWORD_ENABLE \ + 0x2c07acUL +#define TM_REG_DBG_SHIFT \ + 0x2c07b0UL +#define TM_REG_DBG_FORCE_VALID \ + 0x2c07b4UL +#define TM_REG_DBG_FORCE_FRAME \ + 0x2c07b8UL +#define TCFC_REG_DBG_SELECT \ + 0x2d0500UL +#define TCFC_REG_DBG_DWORD_ENABLE \ + 0x2d0504UL +#define TCFC_REG_DBG_SHIFT \ + 0x2d0508UL +#define TCFC_REG_DBG_FORCE_VALID \ + 0x2d050cUL +#define TCFC_REG_DBG_FORCE_FRAME \ + 0x2d0510UL +#define CCFC_REG_DBG_SELECT \ + 0x2e0500UL +#define CCFC_REG_DBG_DWORD_ENABLE \ + 0x2e0504UL +#define CCFC_REG_DBG_SHIFT \ + 0x2e0508UL +#define CCFC_REG_DBG_FORCE_VALID \ + 0x2e050cUL +#define CCFC_REG_DBG_FORCE_FRAME \ + 0x2e0510UL +#define QM_REG_DBG_SELECT \ + 0x2f2e74UL +#define QM_REG_DBG_DWORD_ENABLE \ + 0x2f2e78UL +#define QM_REG_DBG_SHIFT \ + 0x2f2e7cUL +#define QM_REG_DBG_FORCE_VALID \ + 0x2f2e80UL +#define QM_REG_DBG_FORCE_FRAME \ + 0x2f2e84UL +#define RDIF_REG_DBG_SELECT \ + 0x300500UL +#define RDIF_REG_DBG_DWORD_ENABLE \ + 0x300504UL +#define RDIF_REG_DBG_SHIFT \ + 0x300508UL +#define RDIF_REG_DBG_FORCE_VALID \ + 0x30050cUL +#define RDIF_REG_DBG_FORCE_FRAME \ + 0x300510UL +#define TDIF_REG_DBG_SELECT \ + 0x310500UL +#define TDIF_REG_DBG_DWORD_ENABLE \ + 0x310504UL +#define TDIF_REG_DBG_SHIFT \ + 0x310508UL +#define TDIF_REG_DBG_FORCE_VALID \ + 0x31050cUL +#define TDIF_REG_DBG_FORCE_FRAME \ + 0x310510UL +#define BRB_REG_DBG_SELECT \ + 0x340ed0UL +#define BRB_REG_DBG_DWORD_ENABLE \ + 0x340ed4UL +#define BRB_REG_DBG_SHIFT \ + 0x340ed8UL +#define BRB_REG_DBG_FORCE_VALID \ + 0x340edcUL +#define BRB_REG_DBG_FORCE_FRAME \ + 0x340ee0UL +#define XYLD_REG_DBG_SELECT \ + 0x4c1600UL +#define XYLD_REG_DBG_DWORD_ENABLE \ + 0x4c1604UL +#define XYLD_REG_DBG_SHIFT \ + 0x4c1608UL +#define XYLD_REG_DBG_FORCE_VALID \ + 0x4c160cUL +#define XYLD_REG_DBG_FORCE_FRAME \ + 0x4c1610UL +#define YULD_REG_DBG_SELECT \ + 0x4c9600UL +#define YULD_REG_DBG_DWORD_ENABLE \ + 0x4c9604UL +#define YULD_REG_DBG_SHIFT \ + 0x4c9608UL +#define YULD_REG_DBG_FORCE_VALID \ + 0x4c960cUL +#define YULD_REG_DBG_FORCE_FRAME \ + 0x4c9610UL +#define TMLD_REG_DBG_SELECT \ + 0x4d1600UL +#define TMLD_REG_DBG_DWORD_ENABLE \ + 0x4d1604UL +#define TMLD_REG_DBG_SHIFT \ + 0x4d1608UL +#define TMLD_REG_DBG_FORCE_VALID \ + 0x4d160cUL +#define TMLD_REG_DBG_FORCE_FRAME \ + 0x4d1610UL +#define MULD_REG_DBG_SELECT \ + 0x4e1600UL +#define MULD_REG_DBG_DWORD_ENABLE \ + 0x4e1604UL +#define MULD_REG_DBG_SHIFT \ + 0x4e1608UL +#define MULD_REG_DBG_FORCE_VALID \ + 0x4e160cUL +#define MULD_REG_DBG_FORCE_FRAME \ + 0x4e1610UL +#define NIG_REG_DBG_SELECT \ + 0x502140UL +#define NIG_REG_DBG_DWORD_ENABLE \ + 0x502144UL +#define NIG_REG_DBG_SHIFT \ + 0x502148UL +#define NIG_REG_DBG_FORCE_VALID \ + 0x50214cUL +#define NIG_REG_DBG_FORCE_FRAME \ + 0x502150UL +#define BMB_REG_DBG_SELECT \ + 0x540a7cUL +#define BMB_REG_DBG_DWORD_ENABLE \ + 0x540a80UL +#define BMB_REG_DBG_SHIFT \ + 0x540a84UL +#define BMB_REG_DBG_FORCE_VALID \ + 0x540a88UL +#define BMB_REG_DBG_FORCE_FRAME \ + 0x540a8cUL +#define PTU_REG_DBG_SELECT \ + 0x560100UL +#define PTU_REG_DBG_DWORD_ENABLE \ + 0x560104UL +#define PTU_REG_DBG_SHIFT \ + 0x560108UL +#define PTU_REG_DBG_FORCE_VALID \ + 0x56010cUL +#define PTU_REG_DBG_FORCE_FRAME \ + 0x560110UL +#define CDU_REG_DBG_SELECT \ + 0x580704UL +#define CDU_REG_DBG_DWORD_ENABLE \ + 0x580708UL +#define CDU_REG_DBG_SHIFT \ + 0x58070cUL +#define CDU_REG_DBG_FORCE_VALID \ + 0x580710UL +#define CDU_REG_DBG_FORCE_FRAME \ + 0x580714UL +#define WOL_REG_DBG_SELECT \ + 0x600140UL +#define WOL_REG_DBG_DWORD_ENABLE \ + 0x600144UL +#define WOL_REG_DBG_SHIFT \ + 0x600148UL +#define WOL_REG_DBG_FORCE_VALID \ + 0x60014cUL +#define WOL_REG_DBG_FORCE_FRAME \ + 0x600150UL +#define BMBN_REG_DBG_SELECT \ + 0x610140UL +#define BMBN_REG_DBG_DWORD_ENABLE \ + 0x610144UL +#define BMBN_REG_DBG_SHIFT \ + 0x610148UL +#define BMBN_REG_DBG_FORCE_VALID \ + 0x61014cUL +#define BMBN_REG_DBG_FORCE_FRAME \ + 0x610150UL +#define NWM_REG_DBG_SELECT \ + 0x8000ecUL +#define NWM_REG_DBG_DWORD_ENABLE \ + 0x8000f0UL +#define NWM_REG_DBG_SHIFT \ + 0x8000f4UL +#define NWM_REG_DBG_FORCE_VALID \ + 0x8000f8UL +#define NWM_REG_DBG_FORCE_FRAME \ + 0x8000fcUL +#define PBF_REG_DBG_SELECT \ + 0xd80060UL +#define PBF_REG_DBG_DWORD_ENABLE \ + 0xd80064UL +#define PBF_REG_DBG_SHIFT \ + 0xd80068UL +#define PBF_REG_DBG_FORCE_VALID \ + 0xd8006cUL +#define PBF_REG_DBG_FORCE_FRAME \ + 0xd80070UL +#define PBF_PB1_REG_DBG_SELECT \ + 0xda0728UL +#define PBF_PB1_REG_DBG_DWORD_ENABLE \ + 0xda072cUL +#define PBF_PB1_REG_DBG_SHIFT \ + 0xda0730UL +#define PBF_PB1_REG_DBG_FORCE_VALID \ + 0xda0734UL +#define PBF_PB1_REG_DBG_FORCE_FRAME \ + 0xda0738UL +#define PBF_PB2_REG_DBG_SELECT \ + 0xda4728UL +#define PBF_PB2_REG_DBG_DWORD_ENABLE \ + 0xda472cUL +#define PBF_PB2_REG_DBG_SHIFT \ + 0xda4730UL +#define PBF_PB2_REG_DBG_FORCE_VALID \ + 0xda4734UL +#define PBF_PB2_REG_DBG_FORCE_FRAME \ + 0xda4738UL +#define BTB_REG_DBG_SELECT \ + 0xdb08c8UL +#define BTB_REG_DBG_DWORD_ENABLE \ + 0xdb08ccUL +#define BTB_REG_DBG_SHIFT \ + 0xdb08d0UL +#define BTB_REG_DBG_FORCE_VALID \ + 0xdb08d4UL +#define BTB_REG_DBG_FORCE_FRAME \ + 0xdb08d8UL +#define XSDM_REG_DBG_SELECT \ + 0xf80e28UL +#define XSDM_REG_DBG_DWORD_ENABLE \ + 0xf80e2cUL +#define XSDM_REG_DBG_SHIFT \ + 0xf80e30UL +#define XSDM_REG_DBG_FORCE_VALID \ + 0xf80e34UL +#define XSDM_REG_DBG_FORCE_FRAME \ + 0xf80e38UL +#define YSDM_REG_DBG_SELECT \ + 0xf90e28UL +#define YSDM_REG_DBG_DWORD_ENABLE \ + 0xf90e2cUL +#define YSDM_REG_DBG_SHIFT \ + 0xf90e30UL +#define YSDM_REG_DBG_FORCE_VALID \ + 0xf90e34UL +#define YSDM_REG_DBG_FORCE_FRAME \ + 0xf90e38UL +#define PSDM_REG_DBG_SELECT \ + 0xfa0e28UL +#define PSDM_REG_DBG_DWORD_ENABLE \ + 0xfa0e2cUL +#define PSDM_REG_DBG_SHIFT \ + 0xfa0e30UL +#define PSDM_REG_DBG_FORCE_VALID \ + 0xfa0e34UL +#define PSDM_REG_DBG_FORCE_FRAME \ + 0xfa0e38UL +#define TSDM_REG_DBG_SELECT \ + 0xfb0e28UL +#define TSDM_REG_DBG_DWORD_ENABLE \ + 0xfb0e2cUL +#define TSDM_REG_DBG_SHIFT \ + 0xfb0e30UL +#define TSDM_REG_DBG_FORCE_VALID \ + 0xfb0e34UL +#define TSDM_REG_DBG_FORCE_FRAME \ + 0xfb0e38UL +#define MSDM_REG_DBG_SELECT \ + 0xfc0e28UL +#define MSDM_REG_DBG_DWORD_ENABLE \ + 0xfc0e2cUL +#define MSDM_REG_DBG_SHIFT \ + 0xfc0e30UL +#define MSDM_REG_DBG_FORCE_VALID \ + 0xfc0e34UL +#define MSDM_REG_DBG_FORCE_FRAME \ + 0xfc0e38UL +#define USDM_REG_DBG_SELECT \ + 0xfd0e28UL +#define USDM_REG_DBG_DWORD_ENABLE \ + 0xfd0e2cUL +#define USDM_REG_DBG_SHIFT \ + 0xfd0e30UL +#define USDM_REG_DBG_FORCE_VALID \ + 0xfd0e34UL +#define USDM_REG_DBG_FORCE_FRAME \ + 0xfd0e38UL +#define XCM_REG_DBG_SELECT \ + 0x1000040UL +#define XCM_REG_DBG_DWORD_ENABLE \ + 0x1000044UL +#define XCM_REG_DBG_SHIFT \ + 0x1000048UL +#define XCM_REG_DBG_FORCE_VALID \ + 0x100004cUL +#define XCM_REG_DBG_FORCE_FRAME \ + 0x1000050UL +#define YCM_REG_DBG_SELECT \ + 0x1080040UL +#define YCM_REG_DBG_DWORD_ENABLE \ + 0x1080044UL +#define YCM_REG_DBG_SHIFT \ + 0x1080048UL +#define YCM_REG_DBG_FORCE_VALID \ + 0x108004cUL +#define YCM_REG_DBG_FORCE_FRAME \ + 0x1080050UL +#define PCM_REG_DBG_SELECT \ + 0x1100040UL +#define PCM_REG_DBG_DWORD_ENABLE \ + 0x1100044UL +#define PCM_REG_DBG_SHIFT \ + 0x1100048UL +#define PCM_REG_DBG_FORCE_VALID \ + 0x110004cUL +#define PCM_REG_DBG_FORCE_FRAME \ + 0x1100050UL +#define TCM_REG_DBG_SELECT \ + 0x1180040UL +#define TCM_REG_DBG_DWORD_ENABLE \ + 0x1180044UL +#define TCM_REG_DBG_SHIFT \ + 0x1180048UL +#define TCM_REG_DBG_FORCE_VALID \ + 0x118004cUL +#define TCM_REG_DBG_FORCE_FRAME \ + 0x1180050UL +#define MCM_REG_DBG_SELECT \ + 0x1200040UL +#define MCM_REG_DBG_DWORD_ENABLE \ + 0x1200044UL +#define MCM_REG_DBG_SHIFT \ + 0x1200048UL +#define MCM_REG_DBG_FORCE_VALID \ + 0x120004cUL +#define MCM_REG_DBG_FORCE_FRAME \ + 0x1200050UL +#define UCM_REG_DBG_SELECT \ + 0x1280050UL +#define UCM_REG_DBG_DWORD_ENABLE \ + 0x1280054UL +#define UCM_REG_DBG_SHIFT \ + 0x1280058UL +#define UCM_REG_DBG_FORCE_VALID \ + 0x128005cUL +#define UCM_REG_DBG_FORCE_FRAME \ + 0x1280060UL +#define XSEM_REG_DBG_SELECT \ + 0x1401528UL +#define XSEM_REG_DBG_DWORD_ENABLE \ + 0x140152cUL +#define XSEM_REG_DBG_SHIFT \ + 0x1401530UL +#define XSEM_REG_DBG_FORCE_VALID \ + 0x1401534UL +#define XSEM_REG_DBG_FORCE_FRAME \ + 0x1401538UL +#define YSEM_REG_DBG_SELECT \ + 0x1501528UL +#define YSEM_REG_DBG_DWORD_ENABLE \ + 0x150152cUL +#define YSEM_REG_DBG_SHIFT \ + 0x1501530UL +#define YSEM_REG_DBG_FORCE_VALID \ + 0x1501534UL +#define YSEM_REG_DBG_FORCE_FRAME \ + 0x1501538UL +#define PSEM_REG_DBG_SELECT \ + 0x1601528UL +#define PSEM_REG_DBG_DWORD_ENABLE \ + 0x160152cUL +#define PSEM_REG_DBG_SHIFT \ + 0x1601530UL +#define PSEM_REG_DBG_FORCE_VALID \ + 0x1601534UL +#define PSEM_REG_DBG_FORCE_FRAME \ + 0x1601538UL +#define TSEM_REG_DBG_SELECT \ + 0x1701528UL +#define TSEM_REG_DBG_DWORD_ENABLE \ + 0x170152cUL +#define TSEM_REG_DBG_SHIFT \ + 0x1701530UL +#define TSEM_REG_DBG_FORCE_VALID \ + 0x1701534UL +#define TSEM_REG_DBG_FORCE_FRAME \ + 0x1701538UL +#define MSEM_REG_DBG_SELECT \ + 0x1801528UL +#define MSEM_REG_DBG_DWORD_ENABLE \ + 0x180152cUL +#define MSEM_REG_DBG_SHIFT \ + 0x1801530UL +#define MSEM_REG_DBG_FORCE_VALID \ + 0x1801534UL +#define MSEM_REG_DBG_FORCE_FRAME \ + 0x1801538UL +#define USEM_REG_DBG_SELECT \ + 0x1901528UL +#define USEM_REG_DBG_DWORD_ENABLE \ + 0x190152cUL +#define USEM_REG_DBG_SHIFT \ + 0x1901530UL +#define USEM_REG_DBG_FORCE_VALID \ + 0x1901534UL +#define USEM_REG_DBG_FORCE_FRAME \ + 0x1901538UL +#define PCIE_REG_DBG_COMMON_SELECT \ + 0x054398UL +#define PCIE_REG_DBG_COMMON_DWORD_ENABLE \ + 0x05439cUL +#define PCIE_REG_DBG_COMMON_SHIFT \ + 0x0543a0UL +#define PCIE_REG_DBG_COMMON_FORCE_VALID \ + 0x0543a4UL +#define PCIE_REG_DBG_COMMON_FORCE_FRAME \ + 0x0543a8UL +#define MISC_REG_RESET_PL_UA \ + 0x008050UL +#define MISC_REG_RESET_PL_HV \ + 0x008060UL +#define XCM_REG_CTX_RBC_ACCS \ + 0x1001800UL +#define XCM_REG_AGG_CON_CTX \ + 0x1001804UL +#define XCM_REG_SM_CON_CTX \ + 0x1001808UL +#define YCM_REG_CTX_RBC_ACCS \ + 0x1081800UL +#define YCM_REG_AGG_CON_CTX \ + 0x1081804UL +#define YCM_REG_AGG_TASK_CTX \ + 0x1081808UL +#define YCM_REG_SM_CON_CTX \ + 0x108180cUL +#define YCM_REG_SM_TASK_CTX \ + 0x1081810UL +#define PCM_REG_CTX_RBC_ACCS \ + 0x1101440UL +#define PCM_REG_SM_CON_CTX \ + 0x1101444UL +#define TCM_REG_CTX_RBC_ACCS \ + 0x11814c0UL +#define TCM_REG_AGG_CON_CTX \ + 0x11814c4UL +#define TCM_REG_AGG_TASK_CTX \ + 0x11814c8UL +#define TCM_REG_SM_CON_CTX \ + 0x11814ccUL +#define TCM_REG_SM_TASK_CTX \ + 0x11814d0UL +#define MCM_REG_CTX_RBC_ACCS \ + 0x1201800UL +#define MCM_REG_AGG_CON_CTX \ + 0x1201804UL +#define MCM_REG_AGG_TASK_CTX \ + 0x1201808UL +#define MCM_REG_SM_CON_CTX \ + 0x120180cUL +#define MCM_REG_SM_TASK_CTX \ + 0x1201810UL +#define UCM_REG_CTX_RBC_ACCS \ + 0x1281700UL +#define UCM_REG_AGG_CON_CTX \ + 0x1281704UL +#define UCM_REG_AGG_TASK_CTX \ + 0x1281708UL +#define UCM_REG_SM_CON_CTX \ + 0x128170cUL +#define UCM_REG_SM_TASK_CTX \ + 0x1281710UL +#define XSEM_REG_SLOW_DBG_EMPTY \ + 0x1401140UL +#define XSEM_REG_SYNC_DBG_EMPTY \ + 0x1401160UL +#define XSEM_REG_SLOW_DBG_ACTIVE \ + 0x1401400UL +#define XSEM_REG_SLOW_DBG_MODE \ + 0x1401404UL +#define XSEM_REG_DBG_FRAME_MODE \ + 0x1401408UL +#define XSEM_REG_DBG_MODE1_CFG \ + 0x1401420UL +#define XSEM_REG_FAST_MEMORY \ + 0x1440000UL +#define YSEM_REG_SYNC_DBG_EMPTY \ + 0x1501160UL +#define YSEM_REG_SLOW_DBG_ACTIVE \ + 0x1501400UL +#define YSEM_REG_SLOW_DBG_MODE \ + 0x1501404UL +#define YSEM_REG_DBG_FRAME_MODE \ + 0x1501408UL +#define YSEM_REG_DBG_MODE1_CFG \ + 0x1501420UL +#define YSEM_REG_FAST_MEMORY \ + 0x1540000UL +#define PSEM_REG_SLOW_DBG_EMPTY \ + 0x1601140UL +#define PSEM_REG_SYNC_DBG_EMPTY \ + 0x1601160UL +#define PSEM_REG_SLOW_DBG_ACTIVE \ + 0x1601400UL +#define PSEM_REG_SLOW_DBG_MODE \ + 0x1601404UL +#define PSEM_REG_DBG_FRAME_MODE \ + 0x1601408UL +#define PSEM_REG_DBG_MODE1_CFG \ + 0x1601420UL +#define PSEM_REG_FAST_MEMORY \ + 0x1640000UL +#define TSEM_REG_SLOW_DBG_EMPTY \ + 0x1701140UL +#define TSEM_REG_SYNC_DBG_EMPTY \ + 0x1701160UL +#define TSEM_REG_SLOW_DBG_ACTIVE \ + 0x1701400UL +#define TSEM_REG_SLOW_DBG_MODE \ + 0x1701404UL +#define TSEM_REG_DBG_FRAME_MODE \ + 0x1701408UL +#define TSEM_REG_DBG_MODE1_CFG \ + 0x1701420UL +#define TSEM_REG_FAST_MEMORY \ + 0x1740000UL +#define MSEM_REG_SLOW_DBG_EMPTY \ + 0x1801140UL +#define MSEM_REG_SYNC_DBG_EMPTY \ + 0x1801160UL +#define MSEM_REG_SLOW_DBG_ACTIVE \ + 0x1801400UL +#define MSEM_REG_SLOW_DBG_MODE \ + 0x1801404UL +#define MSEM_REG_DBG_FRAME_MODE \ + 0x1801408UL +#define MSEM_REG_DBG_MODE1_CFG \ + 0x1801420UL +#define MSEM_REG_FAST_MEMORY \ + 0x1840000UL +#define USEM_REG_SLOW_DBG_EMPTY \ + 0x1901140UL +#define USEM_REG_SYNC_DBG_EMPTY \ + 0x1901160UL +#define USEM_REG_SLOW_DBG_ACTIVE \ + 0x1901400UL +#define USEM_REG_SLOW_DBG_MODE \ + 0x1901404UL +#define USEM_REG_DBG_FRAME_MODE \ + 0x1901408UL +#define USEM_REG_DBG_MODE1_CFG \ + 0x1901420UL +#define USEM_REG_FAST_MEMORY \ + 0x1940000UL +#define SEM_FAST_REG_INT_RAM \ + 0x020000UL +#define SEM_FAST_REG_INT_RAM_SIZE \ + 20480 +#define GRC_REG_TRACE_FIFO_VALID_DATA \ + 0x050064UL +#define GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW \ + 0x05040cUL +#define GRC_REG_PROTECTION_OVERRIDE_WINDOW \ + 0x050500UL +#define IGU_REG_ERROR_HANDLING_MEMORY \ + 0x181520UL #define MCP_REG_CPU_MODE \ 0xe05000UL #define MCP_REG_CPU_MODE_SOFT_HALT \ (0x1 << 10) +#define BRB_REG_BIG_RAM_ADDRESS \ + 0x340800UL +#define BRB_REG_BIG_RAM_DATA \ + 0x341500UL +#define SEM_FAST_REG_STALL_0 \ + 0x000488UL +#define SEM_FAST_REG_STALLED \ + 0x000494UL +#define BTB_REG_BIG_RAM_ADDRESS \ + 0xdb0800UL +#define BTB_REG_BIG_RAM_DATA \ + 0xdb0c00UL +#define BMB_REG_BIG_RAM_ADDRESS \ + 0x540800UL +#define BMB_REG_BIG_RAM_DATA \ + 0x540f00UL +#define SEM_FAST_REG_STORM_REG_FILE \ + 0x008000UL +#define RSS_REG_RSS_RAM_ADDR \ + 0x238c30UL +#define MISCS_REG_BLOCK_256B_EN \ + 0x009074UL +#define MCP_REG_SCRATCH_SIZE \ + 57344 +#define MCP_REG_CPU_REG_FILE \ + 0xe05200UL +#define MCP_REG_CPU_REG_FILE_SIZE \ + 32 +#define DBG_REG_DEBUG_TARGET \ + 0x01005cUL +#define DBG_REG_FULL_MODE \ + 0x010060UL +#define DBG_REG_CALENDAR_OUT_DATA \ + 0x010480UL +#define GRC_REG_TRACE_FIFO \ + 0x050068UL +#define IGU_REG_ERROR_HANDLING_DATA_VALID \ + 0x181530UL +#define DBG_REG_DBG_BLOCK_ON \ + 0x010454UL +#define DBG_REG_FRAMING_MODE \ + 0x010058UL +#define SEM_FAST_REG_VFC_DATA_WR \ + 0x000b40UL +#define SEM_FAST_REG_VFC_ADDR \ + 0x000b44UL +#define SEM_FAST_REG_VFC_DATA_RD \ + 0x000b48UL +#define RSS_REG_RSS_RAM_DATA \ + 0x238c20UL +#define MISC_REG_BLOCK_256B_EN \ + 0x008c14UL +#define NWS_REG_NWS_CMU \ + 0x720000UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0 \ + 0x000680UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8 \ + 0x000684UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0 \ + 0x0006c0UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8 \ + 0x0006c4UL +#define MS_REG_MS_CMU \ + 0x6a4000UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130 \ + 0x000208UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132 \ + 0x000210UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131 \ + 0x00020cUL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133 \ + 0x000214UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130 \ + 0x000208UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131 \ + 0x00020cUL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132 \ + 0x000210UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133 \ + 0x000214UL +#define PHY_PCIE_REG_PHY0 \ + 0x620000UL +#define PHY_PCIE_REG_PHY1 \ + 0x624000UL #endif diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 70b30e4d3cc4..19027635df0d 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -143,6 +143,9 @@ #define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) #define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) +/* Tools Version */ +#define TOOLS_VERSION 10 + /*****************/ /* CDU CONSTANTS */ /*****************/ -- cgit v1.2.3 From e0971c832af4cd906ab931c9f6e9e1791a62fc98 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:25 +0300 Subject: qed*: Add support for the ethtool get_regs operation Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 2 ++ drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 24 ++++++++++++++++++++++++ include/linux/qed/qed_if.h | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 250efd1d270d..b730a632c383 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1398,6 +1398,8 @@ const struct qed_common_ops qed_common_ops_pass = { .get_link = &qed_get_current_link, .drain = &qed_drain, .update_msglvl = &qed_init_dp, + .dbg_all_data = &qed_dbg_all_data, + .dbg_all_data_size = &qed_dbg_all_data_size, .chain_alloc = &qed_chain_alloc, .chain_free = &qed_chain_free, .get_coalesce = &qed_get_coalesce, diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 14d5328e6ac9..25a9b293ee8f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -695,6 +695,28 @@ static int qede_set_pauseparam(struct net_device *dev, return 0; } +static void qede_get_regs(struct net_device *ndev, + struct ethtool_regs *regs, void *buffer) +{ + struct qede_dev *edev = netdev_priv(ndev); + + regs->version = 0; + memset(buffer, 0, regs->len); + + if (edev->ops && edev->ops->common) + edev->ops->common->dbg_all_data(edev->cdev, buffer); +} + +static int qede_get_regs_len(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + if (edev->ops && edev->ops->common) + return edev->ops->common->dbg_all_data_size(edev->cdev); + else + return -EINVAL; +} + static void qede_update_mtu(struct qede_dev *edev, union qede_reload_args *args) { edev->ndev->mtu = args->mtu; @@ -1395,6 +1417,8 @@ static const struct ethtool_ops qede_ethtool_ops = { .get_link_ksettings = qede_get_link_ksettings, .set_link_ksettings = qede_set_link_ksettings, .get_drvinfo = qede_get_drvinfo, + .get_regs_len = qede_get_regs_len, + .get_regs = qede_get_regs, .get_msglevel = qede_get_msglevel, .set_msglevel = qede_set_msglevel, .nway_reset = qede_nway_reset, diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d8dc5c2243d5..e4546abcea08 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -455,6 +455,10 @@ struct qed_common_ops { void (*simd_handler_clean)(struct qed_dev *cdev, int index); + int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + + int (*dbg_all_data_size) (struct qed_dev *cdev); + /** * @brief can_link_change - can the instance change the link or not * -- cgit v1.2.3 From 18f1387c7d7c6827b3ed6adf6ae20f65a58dc7b0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:11 +0100 Subject: rxrpc: Update protocol definitions slightly Update the protocol definitions in include/rxrpc/packet.h slightly: (1) Get rid of RXRPC_PROCESS_MAXCALLS as it's redundant (same as RXRPC_MAXCALLS). (2) In struct rxrpc_jumbo_header, put _rsvd in a union with a field called cksum to match struct rxrpc_wire_header. (3) Provide RXRPC_JUMBO_SUBPKTLEN which is the total of the amount of data in a non-terminal subpacket plus the following secondary header for the next packet included in the jumbo packet. Signed-off-by: David Howells --- include/rxrpc/packet.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index 3c6128e1fdbe..b0ae5c1a6ce6 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -34,8 +34,6 @@ struct rxrpc_wire_header { #define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ __be32 callNumber; /* call ID (0 for connection-level packets) */ -#define RXRPC_PROCESS_MAXCALLS (1<<2) /* maximum number of active calls per conn (power of 2) */ - __be32 seq; /* sequence number of pkt in call stream */ __be32 serial; /* serial number of pkt sent to network */ @@ -93,10 +91,14 @@ struct rxrpc_wire_header { struct rxrpc_jumbo_header { uint8_t flags; /* packet flags (as per rxrpc_header) */ uint8_t pad; - __be16 _rsvd; /* reserved (used by kerberos security as cksum) */ + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; }; #define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ +#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) /*****************************************************************************/ /* -- cgit v1.2.3 From 2ab27215ea27475a0b279732ba8a934bfab57ef0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Remove skb_count from struct rxrpc_call Remove the sk_buff count from the rxrpc_call struct as it's less useful once we stop queueing sk_buffs. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 10 +++------- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_object.c | 34 ++++++++++++---------------------- 3 files changed, 15 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 85ee035774ae..6b06cf050bc0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -18,16 +18,14 @@ TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, - int usage, int nskb, - const void *where, const void *aux), + int usage, const void *where, const void *aux), - TP_ARGS(call, op, usage, nskb, where, aux), + TP_ARGS(call, op, usage, where, aux), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) __field(int, op ) __field(int, usage ) - __field(int, nskb ) __field(const void *, where ) __field(const void *, aux ) ), @@ -36,16 +34,14 @@ TRACE_EVENT(rxrpc_call, __entry->call = call; __entry->op = op; __entry->usage = usage; - __entry->nskb = nskb; __entry->where = where; __entry->aux = aux; ), - TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", + TP_printk("c=%p %s u=%d sp=%pSR a=%p", __entry->call, rxrpc_call_traces[__entry->op], __entry->usage, - __entry->nskb, __entry->where, __entry->aux) ); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd438dc93ee9..027791261768 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -467,7 +467,6 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9efd9b0b0bdf..f843397e03b6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -232,9 +232,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, rxrpc_call_new_client, - atomic_read(&call->usage), 0, - here, (const void *)user_call_ID); + trace_rxrpc_call(call, 0, atomic_read(&call->usage), here, + (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; @@ -325,7 +324,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, return ERR_PTR(-EBUSY); trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), 0, here, NULL); + atomic_read(&candidate->usage), here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->conn = conn; @@ -446,11 +445,10 @@ bool rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = __atomic_add_unless(&call->usage, 1, 0); - int m = atomic_read(&call->skb_count); if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -463,10 +461,9 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -480,9 +477,8 @@ void rxrpc_see_call(struct rxrpc_call *call) const void *here = __builtin_return_address(0); if (call) { int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); } } @@ -493,9 +489,8 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); } /* @@ -505,9 +500,8 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); } /* @@ -642,17 +636,15 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); - int n, m; + int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } @@ -663,15 +655,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); - int n, m; + int n; n = atomic_dec_return(&call->usage); - m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } -- cgit v1.2.3 From 49e19ec7d3499f79d2b3a45bb28418e89512fd7a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Add tracepoints to record received packets and end of data_ready Add two tracepoints: (1) Record the RxRPC protocol header of packets retrieved from the UDP socket by the data_ready handler. (2) Record the outcome of the data_ready handler. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 38 ++++++++++++++++++++++++++++++++++++++ net/rxrpc/input.c | 8 ++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6b06cf050bc0..ea3b10ed91a8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -80,6 +80,44 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_rx_packet, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags) + ); + +TRACE_EVENT(rxrpc_rx_done, + TP_PROTO(int result, int abort_code), + + TP_ARGS(result, abort_code), + + TP_STRUCT__entry( + __field(int, result ) + __field(int, abort_code ) + ), + + TP_fast_assign( + __entry->result = result; + __entry->abort_code = abort_code; + ), + + TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) + ); + TRACE_EVENT(rxrpc_abort, TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8e624109750a..6c4b7df05e95 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -683,6 +683,7 @@ void rxrpc_data_ready(struct sock *sk) /* dig out the RxRPC connection details */ if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; + trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", @@ -767,6 +768,7 @@ discard_unlock: out_unlock: rcu_read_unlock(); out: + trace_rxrpc_rx_done(0, 0); return; cant_route_call: @@ -780,7 +782,7 @@ cant_route_call: skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->processor); _leave(" [incoming]"); - return; + goto out; } skb->priority = RX_INVALID_OPERATION; } else { @@ -789,7 +791,7 @@ cant_route_call: if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { _debug("reject type %d",sp->hdr.type); - rxrpc_reject_packet(local, skb); + goto reject_packet; } else { rxrpc_free_skb(skb); } @@ -798,6 +800,8 @@ cant_route_call: bad_message: skb->priority = RX_PROTOCOL_ERROR; +reject_packet: + trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); _leave(" [badmsg]"); } -- cgit v1.2.3 From 00e907127e6f86d0f9b122d9b4347a8aa09a8b61 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maxmimum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept() is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 71 ++++++++++++++- include/net/af_rxrpc.h | 10 ++- net/rxrpc/af_rxrpc.c | 16 +++- net/rxrpc/ar-internal.h | 32 ++++++- net/rxrpc/call_accept.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/call_object.c | 12 ++- net/rxrpc/conn_object.c | 2 + net/rxrpc/conn_service.c | 24 +++++ net/rxrpc/input.c | 2 +- net/rxrpc/proc.c | 8 +- 10 files changed, 391 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 53750dece80e..720ef05a24fe 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,6 +18,7 @@ struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; static void afs_free_call(struct afs_call *); @@ -26,7 +27,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); static void afs_process_async_call(struct work_struct *); -static void afs_rx_new_call(struct sock *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ @@ -54,8 +56,10 @@ static const struct afs_call_type afs_RXCMxxxx = { }; static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -100,13 +104,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call); + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -126,6 +132,12 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); @@ -635,12 +647,65 @@ static void afs_collect_incoming_call(struct work_struct *work) afs_free_call(call); } +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + call->rxcall = rxcall; +} + +/* + * Charge the incoming call preallocation. + */ +static void afs_charge_preallocation(struct work_struct *work) +{ + struct afs_call *call = afs_spare_incoming_call; + + for (;;) { + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) + break; + + INIT_WORK(&call->async_work, afs_process_async_call); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + call->state = AFS_CALL_AWAIT_OP_ID; + } + + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; + } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + /* * Notification of an incoming call. */ -static void afs_rx_new_call(struct sock *sk) +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) { queue_work(afs_wq, &afs_collect_incoming_call_work); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 08ed8729126c..9cf551be916b 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -21,10 +21,14 @@ struct rxrpc_call; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *); +typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, + unsigned long); +typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); +typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t); + rxrpc_notify_new_call_t, + rxrpc_discard_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, struct sockaddr_rxrpc *, struct key *, @@ -43,5 +47,7 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, + rxrpc_user_attach_call_t, unsigned long, gfp_t); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f13cca1e973e..1e8cf3ded81f 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -193,7 +193,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - unsigned int max; + unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); @@ -212,9 +212,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) backlog = max; else if (backlog < 0 || backlog > max) break; + old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; - rx->sk.sk_state = RXRPC_SERVER_LISTENING; - ret = 0; + ret = rxrpc_service_prealloc(rx, GFP_KERNEL); + if (ret == 0) + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + else + sk->sk_max_ack_backlog = old; break; default: ret = -EBUSY; @@ -303,16 +307,19 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on * @notify_new_call: Function to be called when new calls appear + * @discard_new_call: Function to discard preallocated calls * * Allow a kernel service to be given notifications about new calls. */ void rxrpc_kernel_new_call_notification( struct socket *sock, - rxrpc_notify_new_call_t notify_new_call) + rxrpc_notify_new_call_t notify_new_call, + rxrpc_discard_new_call_t discard_new_call) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); rx->notify_new_call = notify_new_call; + rx->discard_new_call = discard_new_call; } EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); @@ -622,6 +629,7 @@ static int rxrpc_release_sock(struct sock *sk) } /* try to flush out this socket */ + rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 027791261768..45e1c269f90e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -63,6 +63,27 @@ enum { RXRPC_CLOSE, /* socket is being closed */ }; +/* + * Service backlog preallocation. + * + * This contains circular buffers of preallocated peers, connections and calls + * for incoming service calls and their head and tail pointers. This allows + * calls to be set up in the data_ready handler, thereby avoiding the need to + * shuffle packets around so much. + */ +struct rxrpc_backlog { + unsigned short peer_backlog_head; + unsigned short peer_backlog_tail; + unsigned short conn_backlog_head; + unsigned short conn_backlog_tail; + unsigned short call_backlog_head; + unsigned short call_backlog_tail; +#define RXRPC_BACKLOG_MAX 32 + struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; +}; + /* * RxRPC socket definition */ @@ -70,13 +91,15 @@ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ + rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ + struct rxrpc_backlog *backlog; /* Preallocation for services */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ - struct rb_root calls; /* outstanding calls on this socket */ + struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ @@ -290,6 +313,7 @@ enum rxrpc_conn_cache_state { enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ @@ -408,6 +432,7 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ @@ -534,6 +559,8 @@ extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ +int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); +void rxrpc_discard_prealloc(struct rxrpc_sock *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -557,6 +584,7 @@ extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_alloc_call(gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, @@ -573,6 +601,7 @@ void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); +void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) @@ -757,6 +786,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 4c71efcf82ed..cc7194e05a15 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -20,11 +20,209 @@ #include #include #include +#include #include #include #include #include "ar-internal.h" +/* + * Preallocate a single service call, connection and peer and, if possible, + * give them a user ID and attach the user's side of the ID to them. + */ +static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, + struct rxrpc_backlog *b, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + const void *here = __builtin_return_address(0); + struct rxrpc_call *call; + int max, tmp; + unsigned int size = RXRPC_BACKLOG_MAX; + unsigned int head, tail, call_head, call_tail; + + max = rx->sk.sk_max_ack_backlog; + tmp = rx->sk.sk_ack_backlog; + if (tmp >= max) { + _leave(" = -ENOBUFS [full %u]", max); + return -ENOBUFS; + } + max -= tmp; + + /* We don't need more conns and peers than we have calls, but on the + * other hand, we shouldn't ever use more peers than conns or conns + * than calls. + */ + call_head = b->call_backlog_head; + call_tail = READ_ONCE(b->call_backlog_tail); + tmp = CIRC_CNT(call_head, call_tail, size); + if (tmp >= max) { + _leave(" = -ENOBUFS [enough %u]", tmp); + return -ENOBUFS; + } + max = tmp + 1; + + head = b->peer_backlog_head; + tail = READ_ONCE(b->peer_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp); + if (!peer) + return -ENOMEM; + b->peer_backlog[head] = peer; + smp_store_release(&b->peer_backlog_head, + (head + 1) & (size - 1)); + } + + head = b->conn_backlog_head; + tail = READ_ONCE(b->conn_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_connection *conn; + + conn = rxrpc_prealloc_service_connection(gfp); + if (!conn) + return -ENOMEM; + b->conn_backlog[head] = conn; + smp_store_release(&b->conn_backlog_head, + (head + 1) & (size - 1)); + } + + /* Now it gets complicated, because calls get registered with the + * socket here, particularly if a user ID is preassigned by the user. + */ + call = rxrpc_alloc_call(gfp); + if (!call) + return -ENOMEM; + call->flags |= (1 << RXRPC_CALL_IS_SERVICE); + call->state = RXRPC_CALL_SERVER_PREALLOC; + + trace_rxrpc_call(call, rxrpc_call_new_service, + atomic_read(&call->usage), + here, (const void *)user_call_ID); + + write_lock(&rx->call_lock); + if (user_attach_call) { + struct rxrpc_call *xcall; + struct rb_node *parent, **pp; + + /* Check the user ID isn't already in use */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto id_in_use; + } + + call->user_call_ID = user_call_ID; + call->notify_rx = notify_rx; + rxrpc_get_call(call, rxrpc_call_got); + user_attach_call(call, user_call_ID); + rxrpc_get_call(call, rxrpc_call_got_userid); + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + + write_unlock(&rx->call_lock); + + write_lock(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock(&rxrpc_call_lock); + + b->call_backlog[call_head] = call; + smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); + _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); + return 0; + +id_in_use: + write_unlock(&rx->call_lock); + rxrpc_cleanup_call(call); + _leave(" = -EBADSLT"); + return -EBADSLT; +} + +/* + * Preallocate sufficient service connections, calls and peers to cover the + * entire backlog of a socket. When a new call comes in, if we don't have + * sufficient of each available, the call gets rejected as busy or ignored. + * + * The backlog is replenished when a connection is accepted or rejected. + */ +int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) +{ + struct rxrpc_backlog *b = rx->backlog; + + if (!b) { + b = kzalloc(sizeof(struct rxrpc_backlog), gfp); + if (!b) + return -ENOMEM; + rx->backlog = b; + } + + if (rx->discard_new_call) + return 0; + + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + ; + + return 0; +} + +/* + * Discard the preallocation on a service. + */ +void rxrpc_discard_prealloc(struct rxrpc_sock *rx) +{ + struct rxrpc_backlog *b = rx->backlog; + unsigned int size = RXRPC_BACKLOG_MAX, head, tail; + + if (!b) + return; + rx->backlog = NULL; + + head = b->peer_backlog_head; + tail = b->peer_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_peer *peer = b->peer_backlog[tail]; + kfree(peer); + tail = (tail + 1) & (size - 1); + } + + head = b->conn_backlog_head; + tail = b->conn_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_connection *conn = b->conn_backlog[tail]; + write_lock(&rxrpc_connection_lock); + list_del(&conn->link); + list_del(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); + kfree(conn); + tail = (tail + 1) & (size - 1); + } + + head = b->call_backlog_head; + tail = b->call_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_call *call = b->call_backlog[tail]; + if (rx->discard_new_call) { + _debug("discard %lx", call->user_call_ID); + rx->discard_new_call(call, call->user_call_ID); + } + rxrpc_call_completed(call); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + tail = (tail + 1) & (size - 1); + } + + kfree(b); +} + /* * generate a connection-level abort */ @@ -450,3 +648,34 @@ int rxrpc_kernel_reject_call(struct socket *sock) return ret; } EXPORT_SYMBOL(rxrpc_kernel_reject_call); + +/* + * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls + * @sock: The socket on which to preallocate + * @notify_rx: Event notification function for the call + * @user_attach_call: Func to attach call to user_call_ID + * @user_call_ID: The tag to attach to the preallocated call + * @gfp: The allocation conditions. + * + * Charge up the socket with preallocated calls, each with a user ID. A + * function should be provided to effect the attachment from the user's side. + * The user is given a ref to hold on the call. + * + * Note that the call may be come connected before this function returns. + */ +int rxrpc_kernel_charge_accept(struct socket *sock, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_backlog *b = rx->backlog; + + if (sock->sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + return rxrpc_service_prealloc_one(rx, b, notify_rx, + user_attach_call, user_call_ID, + gfp); +} +EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f843397e03b6..d233adc9b5e5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -31,6 +31,7 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", @@ -71,7 +72,6 @@ DEFINE_RWLOCK(rxrpc_call_lock); static void rxrpc_call_life_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); -static void rxrpc_cleanup_call(struct rxrpc_call *call); /* * find an extant server call @@ -113,7 +113,7 @@ found_extant_call: /* * allocate a new call */ -static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) { struct rxrpc_call *call; @@ -392,6 +392,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (call_id <= conn->channels[chan].call_counter) goto old_call; /* TODO: Just drop packet */ + /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ + rxrpc_get_call(candidate, rxrpc_call_got); + /* make the call available */ _debug("new call"); call = candidate; @@ -596,6 +599,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); + /* We have to release the prealloc backlog ref */ + if (rxrpc_is_service_call(call)) + rxrpc_put_call(call, rxrpc_call_put); _leave(""); } @@ -682,7 +688,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) /* * clean up a call */ -static void rxrpc_cleanup_call(struct rxrpc_call *call) +void rxrpc_cleanup_call(struct rxrpc_call *call) { _net("DESTROY CALL %d", call->debug_id); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 9c6685b97e70..8da82e3aa00e 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -286,6 +286,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) ASSERTCMP(atomic_read(&conn->usage), >, 0); if (likely(atomic_read(&conn->usage) > 1)) continue; + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) + continue; idle_timestamp = READ_ONCE(conn->idle_timestamp); _debug("reap CONN %d { u=%d,t=%ld }", diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 316a92107fee..189338a60457 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -118,6 +118,30 @@ replace_old_connection: goto conn_published; } +/* + * Preallocate a service connection. The connection is placed on the proc and + * reap lists so that we don't have to get the lock from BH context. + */ +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); + + if (conn) { + /* We maintain an extra ref on the connection whilst it is on + * the rxrpc_connections list. + */ + conn->state = RXRPC_CONN_SERVICE_PREALLOC; + atomic_set(&conn->usage, 2); + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); + write_unlock(&rxrpc_connection_lock); + } + + return conn; +} + /* * get a record of an incoming connection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6c4b7df05e95..5906579060cd 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -102,7 +102,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, rx->notify_new_call) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk); + rx->notify_new_call(&rx->sk, NULL, 0); } else if (call->notify_rx) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index dfad23821a62..d529d1b4021c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -17,6 +17,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", @@ -156,6 +157,11 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } conn = list_entry(v, struct rxrpc_connection, proc_link); + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) { + strcpy(lbuff, "no_local"); + strcpy(rbuff, "no_connection"); + goto print; + } sprintf(lbuff, "%pI4:%u", &conn->params.local->srx.transport.sin.sin_addr, @@ -164,7 +170,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pI4:%u", &conn->params.peer->srx.transport.sin.sin_addr, ntohs(conn->params.peer->srx.transport.sin.sin_port)); - +print: seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %s %3u" " %s %08x %08x %08x\n", -- cgit v1.2.3 From 248f219cb8bcbfbd7f132752d44afa2df7c241d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. They can than have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded). (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq() are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't need have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. (3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 51 +- include/net/af_rxrpc.h | 3 - include/rxrpc/packet.h | 7 + net/rxrpc/af_rxrpc.c | 57 +- net/rxrpc/ar-internal.h | 177 +++--- net/rxrpc/call_accept.c | 472 +++++++--------- net/rxrpc/call_event.c | 1357 +++++++--------------------------------------- net/rxrpc/call_object.c | 535 +++++------------- net/rxrpc/conn_event.c | 137 +---- net/rxrpc/conn_object.c | 6 +- net/rxrpc/conn_service.c | 101 +--- net/rxrpc/input.c | 1044 ++++++++++++++++++----------------- net/rxrpc/insecure.c | 13 +- net/rxrpc/local_event.c | 2 +- net/rxrpc/local_object.c | 7 - net/rxrpc/misc.c | 2 +- net/rxrpc/output.c | 125 ++++- net/rxrpc/peer_event.c | 17 +- net/rxrpc/peer_object.c | 82 ++- net/rxrpc/recvmsg.c | 764 ++++++++++++++------------ net/rxrpc/rxkad.c | 108 +++- net/rxrpc/security.c | 10 +- net/rxrpc/sendmsg.c | 126 ++--- net/rxrpc/skbuff.c | 127 ----- 24 files changed, 1993 insertions(+), 3337 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 720ef05a24fe..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -55,10 +55,8 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); static void afs_charge_preallocation(struct work_struct *); -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) @@ -143,6 +141,8 @@ void afs_close_socket(void) TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); flush_workqueue(afs_async_calls); sock_release(afs_socket); @@ -602,51 +602,6 @@ static void afs_process_async_call(struct work_struct *work) _leave(""); } -/* - * accept the backlog of incoming calls - */ -static void afs_collect_incoming_call(struct work_struct *work) -{ - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - - _enter(""); - - do { - if (!call) { - call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } - - INIT_WORK(&call->async_work, afs_process_async_call); - call->wait_mode = &afs_async_incoming_call; - call->type = &afs_RXCMxxxx; - init_waitqueue_head(&call->waitq); - call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); - } - - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long)call, - afs_wake_up_async_call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call->need_attention = true; - queue_work(afs_async_calls, &call->async_work); - call = NULL; - } - } while (!call); - - if (call) - afs_free_call(call); -} - static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; @@ -704,7 +659,7 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - queue_work(afs_wq, &afs_collect_incoming_call_work); + atomic_inc(&afs_outstanding_calls); queue_work(afs_wq, &afs_charge_preallocation_work); } diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 9cf551be916b..1061a472a3e3 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -42,9 +42,6 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, - rxrpc_notify_rx_t); -int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index b0ae5c1a6ce6..fd6eb3a60a8c 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -133,6 +133,13 @@ struct rxrpc_ackpacket { } __packed; +/* Some ACKs refer to specific packets and some are general and can be updated. */ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + /* * ACK packets can have a further piece of information tagged on the end */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1e8cf3ded81f..caa226dd436e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -155,7 +155,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) } if (rx->srx.srx_service) { - write_lock_bh(&local->services_lock); + write_lock(&local->services_lock); hlist_for_each_entry(prx, &local->services, listen_link) { if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; @@ -163,7 +163,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) rx->local = local; hlist_add_head_rcu(&rx->listen_link, &local->services); - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { @@ -176,7 +176,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rxrpc_put_local(local); ret = -EADDRINUSE; error_unlock: @@ -515,15 +515,16 @@ error: static unsigned int rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + unsigned int mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!list_empty(&rx->recvmsg_q)) mask |= POLLIN | POLLRDNORM; /* the socket is writable if there is space to add new data to the @@ -575,8 +576,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; INIT_HLIST_NODE(&rx->listen_link); - INIT_LIST_HEAD(&rx->secureq); - INIT_LIST_HEAD(&rx->acceptq); + spin_lock_init(&rx->incoming_lock); + INIT_LIST_HEAD(&rx->sock_calls); + INIT_LIST_HEAD(&rx->to_be_accepted); + INIT_LIST_HEAD(&rx->recvmsg_q); + rwlock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); @@ -584,6 +588,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return 0; } +/* + * Kill all the calls on a socket and shut it down. + */ +static int rxrpc_shutdown(struct socket *sock, int flags) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + _enter("%p,%d", sk, flags); + + if (flags != SHUT_RDWR) + return -EOPNOTSUPP; + if (sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + lock_sock(sk); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (sk->sk_state < RXRPC_CLOSE) { + sk->sk_state = RXRPC_CLOSE; + sk->sk_shutdown = SHUTDOWN_MASK; + } else { + ret = -ESHUTDOWN; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + rxrpc_discard_prealloc(rx); + + release_sock(sk); + return ret; +} + /* * RxRPC socket destructor */ @@ -623,9 +660,9 @@ static int rxrpc_release_sock(struct sock *sk) ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); if (!hlist_unhashed(&rx->listen_link)) { - write_lock_bh(&rx->local->services_lock); + write_lock(&rx->local->services_lock); hlist_del_rcu(&rx->listen_link); - write_unlock_bh(&rx->local->services_lock); + write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ @@ -678,7 +715,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, - .shutdown = sock_no_shutdown, + .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = rxrpc_sendmsg, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 45e1c269f90e..b1cb79ec4e96 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -94,9 +94,12 @@ struct rxrpc_sock { rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ - struct list_head secureq; /* calls awaiting connection security clearance */ - struct list_head acceptq; /* calls awaiting acceptance */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ + struct list_head sock_calls; /* List of calls owned by this socket */ + struct list_head to_be_accepted; /* calls awaiting acceptance */ + struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ + rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ @@ -138,13 +141,16 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_call *call; /* call with which associated */ - unsigned long resend_at; /* time in jiffies at which to resend */ + union { + unsigned long resend_at; /* time in jiffies at which to resend */ + struct { + u8 nr_jumbo; /* Number of jumbo subpackets */ + }; + }; union { unsigned int offset; /* offset into buffer of next read */ int remain; /* amount of space remaining for next write */ u32 error; /* network error code */ - bool need_resend; /* T if needs resending */ }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -179,7 +185,11 @@ struct rxrpc_security { /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, - rxrpc_seq_t, u16); + unsigned int, unsigned int, rxrpc_seq_t, u16); + + /* Locate the data in a received packet that has been verified. */ + void (*locate_data)(struct rxrpc_call *, struct sk_buff *, + unsigned int *, unsigned int *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -211,7 +221,6 @@ struct rxrpc_local { struct work_struct processor; struct hlist_head services; /* services listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ - struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct rb_root client_conns; /* Client connections by socket params */ @@ -388,38 +397,21 @@ struct rxrpc_connection { */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ - RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ - RXRPC_CALL_RCVD_LAST, /* all packets received */ - RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ - RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ - RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ - RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ - RXRPC_CALL_RX_NO_MORE, /* Don't indicate MSG_MORE from recvmsg() */ + RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ + RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ - RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ - RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ - RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ - RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ RXRPC_CALL_EV_ACK, /* need to generate ACK */ - RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ - RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ - RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ - RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ - RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ - RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */ }; /* @@ -431,7 +423,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ @@ -448,7 +439,6 @@ enum rxrpc_call_state { */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ @@ -465,24 +455,23 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ - struct timer_list lifetimer; /* lifetime remaining on call */ - struct timer_list ack_timer; /* ACK generation timer */ - struct timer_list resend_timer; /* Tx resend timer */ - struct work_struct processor; /* packet processor and ACK generator */ + unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long resend_at; /* When next resend needs to happen */ + unsigned long expire_at; /* When the call times out */ + struct timer_list timer; /* Combined event timer */ + struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ - struct list_head accept_link; /* calls awaiting acceptance */ - struct rb_node sock_node; /* node in socket call tree */ - struct sk_buff_head rx_queue; /* received packets */ - struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ - struct sk_buff_head knlrecv_queue; /* Queue for kernel_recv [TODO: replace this] */ + struct list_head accept_link; /* Link in rx->acceptq */ + struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ + struct list_head sock_link; /* Link in rx->sock_calls */ + struct rb_node sock_node; /* Node in rx->calls */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ - unsigned long creation_jif; /* time of call creation */ unsigned long flags; unsigned long events; spinlock_t lock; @@ -492,40 +481,55 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ - /* transmission-phase ACK management */ - u8 acks_head; /* offset into window of first entry */ - u8 acks_tail; /* offset into window of last entry */ - u8 acks_winsz; /* size of un-ACK'd window */ - u8 acks_unacked; /* lowest unacked packet in last ACK received */ - int acks_latest; /* serial number of latest ACK received */ - rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ - unsigned long *acks_window; /* sent packet window - * - elements are pointers with LSB set if ACK'd + /* Rx/Tx circular buffer, depending on phase. + * + * In the Rx phase, packets are annotated with 0 or the number of the + * segment of a jumbo packet each buffer refers to. There can be up to + * 47 segments in a maximum-size UDP packet. + * + * In the Tx phase, packets are annotated with which buffers have been + * acked. + */ +#define RXRPC_RXTX_BUFF_SIZE 64 +#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) + struct sk_buff **rxtx_buffer; + u8 *rxtx_annotations; +#define RXRPC_TX_ANNO_ACK 0 +#define RXRPC_TX_ANNO_UNACK 1 +#define RXRPC_TX_ANNO_NAK 2 +#define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ +#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ +#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ + rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but + * not hard-ACK'd packet follows this. + */ + rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not + * consumed packet follows this. */ + rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ + rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + u8 rx_winsize; /* Size of Rx window */ + u8 tx_winsize; /* Maximum size of Tx window */ + u8 nr_jumbo_dup; /* Number of jumbo duplicates */ /* receive-phase ACK management */ - rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ - rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ - rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ - rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ - rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ - rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic_t ackr_not_idle; /* number of packets in Rx queue */ + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ - /* received packet records, 1 bit per record */ -#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) - unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + /* transmission-phase ACK management */ + rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; enum rxrpc_call_trace { @@ -535,10 +539,8 @@ enum rxrpc_call_trace { rxrpc_call_queued_ref, rxrpc_call_seen, rxrpc_call_got, - rxrpc_call_got_skb, rxrpc_call_got_userid, rxrpc_call_put, - rxrpc_call_put_skb, rxrpc_call_put_userid, rxrpc_call_put_noqueue, rxrpc_call__nr_trace @@ -561,6 +563,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_connection *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -569,8 +574,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); /* @@ -589,9 +593,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, unsigned long, gfp_t); -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, - struct rxrpc_connection *, - struct sk_buff *); +void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, + struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); @@ -599,8 +602,6 @@ bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); -void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); -void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); @@ -672,13 +673,8 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, { trace_rxrpc_abort(why, call->cid, call->call_id, seq, abort_code, error); - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error)) { - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - return true; - } - return false; + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); } static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, @@ -713,8 +709,6 @@ void __exit rxrpc_destroy_all_client_connections(void); * conn_event.c */ void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct rxrpc_local *); /* * conn_object.c @@ -783,18 +777,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct sockaddr_rxrpc *, - struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); +void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_data_ready(struct sock *); -int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); -void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* * insecure.c @@ -868,6 +858,7 @@ extern const char *rxrpc_acks(u8 reason); */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +void rxrpc_reject_packets(struct rxrpc_local *); /* * peer_event.c @@ -883,6 +874,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, + struct rxrpc_peer *); static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { @@ -912,6 +905,7 @@ extern const struct file_operations rxrpc_connection_seq_fops; /* * recvmsg.c */ +void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* @@ -961,6 +955,23 @@ static inline void rxrpc_sysctl_exit(void) {} */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); +static inline bool before(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) < 0; +} +static inline bool before_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) <= 0; +} +static inline bool after(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) > 0; +} +static inline bool after_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) >= 0; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cc7194e05a15..b8acec0d596e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -129,6 +129,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, set_bit(RXRPC_CALL_HAS_USERID, &call->flags); } + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); write_lock(&rxrpc_call_lock); @@ -186,6 +188,12 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) return; rx->backlog = NULL; + /* Make sure that there aren't any incoming calls in progress before we + * clear the preallocation buffers. + */ + spin_lock_bh(&rx->incoming_lock); + spin_unlock_bh(&rx->incoming_lock); + head = b->peer_backlog_head; tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { @@ -224,251 +232,179 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) } /* - * generate a connection-level abort + * Allocate a new incoming call from the prealloc pool, along with a connection + * and a peer as necessary. */ -static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_wire_header *whdr) +static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct msghdr msg; - struct kvec iov[1]; - size_t len; - int ret; - - _enter("%d,,", local->debug_id); - - whdr->type = RXRPC_PACKET_TYPE_BUSY; - whdr->serial = htonl(1); - - msg.msg_name = &srx->transport.sin; - msg.msg_namelen = sizeof(srx->transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - iov[0].iov_base = whdr; - iov[0].iov_len = sizeof(*whdr); - - len = iov[0].iov_len; - - _proto("Tx BUSY %%1"); + struct rxrpc_backlog *b = rx->backlog; + struct rxrpc_peer *peer, *xpeer; + struct rxrpc_call *call; + unsigned short call_head, conn_head, peer_head; + unsigned short call_tail, conn_tail, peer_tail; + unsigned short call_count, conn_count; + + /* #calls >= #conns >= #peers must hold true. */ + call_head = smp_load_acquire(&b->call_backlog_head); + call_tail = b->call_backlog_tail; + call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); + conn_head = smp_load_acquire(&b->conn_backlog_head); + conn_tail = b->conn_backlog_tail; + conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); + ASSERTCMP(conn_count, >=, call_count); + peer_head = smp_load_acquire(&b->peer_backlog_head); + peer_tail = b->peer_backlog_tail; + ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, + conn_count); + + if (call_count == 0) + return NULL; + + if (!conn) { + /* No connection. We're going to need a peer to start off + * with. If one doesn't yet exist, use a spare from the + * preallocation set. We dump the address into the spare in + * anticipation - and to save on stack space. + */ + xpeer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0) + return NULL; + + peer = rxrpc_lookup_incoming_peer(local, xpeer); + if (peer == xpeer) { + b->peer_backlog[peer_tail] = NULL; + smp_store_release(&b->peer_backlog_tail, + (peer_tail + 1) & + (RXRPC_BACKLOG_MAX - 1)); + } - ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); - if (ret < 0) { - _leave(" = -EAGAIN [sendmsg failed: %d]", ret); - return -EAGAIN; + /* Now allocate and set up the connection */ + conn = b->conn_backlog[conn_tail]; + b->conn_backlog[conn_tail] = NULL; + smp_store_release(&b->conn_backlog_tail, + (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + rxrpc_get_local(local); + conn->params.local = local; + conn->params.peer = peer; + rxrpc_new_incoming_connection(conn, skb); + } else { + rxrpc_get_connection(conn); } - _leave(" = 0"); - return 0; + /* And now we can allocate and set up a new call */ + call = b->call_backlog[call_tail]; + b->call_backlog[call_tail] = NULL; + smp_store_release(&b->call_backlog_tail, + (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + call->conn = conn; + call->peer = rxrpc_get_peer(conn->params.peer); + return call; } /* - * accept an incoming call that needs peer, transport and/or connection setting - * up + * Set up a new incoming call. Called in BH context with the RCU read lock + * held. + * + * If this is for a kernel service, when we allocate the call, it will have + * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the + * retainer ref obtained from the backlog buffer. Prealloc calls for userspace + * services only have the ref from the backlog buffer. We want to pass this + * ref to non-BH context to dispose of. + * + * If we want to report an error, we mark the skb with the packet type and + * abort code and return NULL. */ -static int rxrpc_accept_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sk_buff *skb, - struct sockaddr_rxrpc *srx) +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; - struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; struct rxrpc_call *call; - struct sk_buff *notification; - int ret; _enter(""); - sp = rxrpc_skb(skb); - - /* get a notification message to send to the server app */ - notification = alloc_skb(0, GFP_NOFS); - if (!notification) { - _debug("no memory"); - ret = -ENOMEM; - goto error_nofree; - } - rxrpc_new_skb(notification); - notification->mark = RXRPC_SKB_MARK_NEW_CALL; - - conn = rxrpc_incoming_connection(local, srx, skb); - if (IS_ERR(conn)) { - _debug("no conn"); - ret = PTR_ERR(conn); - goto error; - } - - call = rxrpc_incoming_call(rx, conn, skb); - rxrpc_put_connection(conn); - if (IS_ERR(call)) { - _debug("no call"); - ret = PTR_ERR(call); - goto error; + /* Get the socket providing the service */ + hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) { + if (rx->srx.srx_service == sp->hdr.serviceId) + goto found_service; } - /* attach the call to the socket */ - read_lock_bh(&local->services_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) - goto invalid_service; - - write_lock(&rx->call_lock); - if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call, rxrpc_call_got); - - spin_lock(&call->conn->state_lock); - if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { - _debug("await conn sec"); - list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; - set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); - } else { - _debug("conn ready"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call_for_skb(call, notification); - nsp = rxrpc_skb(notification); - nsp->call = call; - - ASSERTCMP(atomic_read(&call->usage), >=, 3); - - _debug("notify"); - spin_lock(&call->lock); - ret = rxrpc_queue_rcv_skb(call, notification, true, - false); - spin_unlock(&call->lock); - notification = NULL; - BUG_ON(ret < 0); - } - spin_unlock(&call->conn->state_lock); + trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [service]"); + return NULL; - _debug("queued"); +found_service: + spin_lock(&rx->incoming_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) { + trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [close]"); + call = NULL; + goto out; } - write_unlock(&rx->call_lock); - _debug("process"); - rxrpc_fast_process_packet(call, skb); - - _debug("done"); - read_unlock_bh(&local->services_lock); - rxrpc_free_skb(notification); - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = 0"); - return 0; - -invalid_service: - _debug("invalid"); - read_unlock_bh(&local->services_lock); - - rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); - ret = -ECONNREFUSED; -error: - rxrpc_free_skb(notification); -error_nofree: - _leave(" = %d", ret); - return ret; -} + call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + if (!call) { + skb->mark = RXRPC_SKB_MARK_BUSY; + _leave(" = NULL [busy]"); + call = NULL; + goto out; + } -/* - * accept incoming calls that need peer, transport and/or connection setting up - * - the packets we get are all incoming client DATA packets that have seq == 1 - */ -void rxrpc_accept_incoming_calls(struct rxrpc_local *local) -{ - struct rxrpc_skb_priv *sp; - struct sockaddr_rxrpc srx; - struct rxrpc_sock *rx; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - int ret; + /* Make the call live. */ + rxrpc_incoming_call(rx, call, skb); + conn = call->conn; - _enter("%d", local->debug_id); + if (rx->notify_new_call) + rx->notify_new_call(&rx->sk, call, call->user_call_ID); - skb = skb_dequeue(&local->accept_queue); - if (!skb) { - _leave("\n"); - return; - } + spin_lock(&conn->state_lock); + switch (conn->state) { + case RXRPC_CONN_SERVICE_UNSECURED: + conn->state = RXRPC_CONN_SERVICE_CHALLENGING; + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + break; - _net("incoming call skb %p", skb); - - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - - /* Set up a response packet header in case we need it */ - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = 0; - whdr.flags = 0; - whdr.type = 0; - whdr.userStatus = 0; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) - goto drop; - - /* get the socket providing the service */ - read_lock_bh(&local->services_lock); - hlist_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId && - rx->sk.sk_state != RXRPC_CLOSE) - goto found_service; - } - read_unlock_bh(&local->services_lock); - goto invalid_service; + case RXRPC_CONN_SERVICE: + write_lock(&call->state_lock); + if (rx->discard_new_call) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + else + call->state = RXRPC_CALL_SERVER_ACCEPTING; + write_unlock(&call->state_lock); + break; -found_service: - _debug("found service %hd", rx->srx.srx_service); - if (sk_acceptq_is_full(&rx->sk)) - goto backlog_full; - sk_acceptq_added(&rx->sk); - read_unlock_bh(&local->services_lock); - - ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); - if (ret < 0) - sk_acceptq_removed(&rx->sk); - switch (ret) { - case -ECONNRESET: /* old calls are ignored */ - case -ECONNABORTED: /* aborted calls are reaborted or ignored */ - case 0: - return; - case -ECONNREFUSED: - goto invalid_service; - case -EBUSY: - goto busy; - case -EKEYREJECTED: - goto security_mismatch; + case RXRPC_CONN_REMOTELY_ABORTED: + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + conn->remote_abort, ECONNABORTED); + break; + case RXRPC_CONN_LOCALLY_ABORTED: + rxrpc_abort_call("CON", call, sp->hdr.seq, + conn->local_abort, ECONNABORTED); + break; default: BUG(); } + spin_unlock(&conn->state_lock); -backlog_full: - read_unlock_bh(&local->services_lock); -busy: - rxrpc_busy(local, &srx, &whdr); - rxrpc_free_skb(skb); - return; - -drop: - rxrpc_free_skb(skb); - return; + if (call->state == RXRPC_CALL_SERVER_ACCEPTING) + rxrpc_notify_socket(call); -invalid_service: - skb->priority = RX_INVALID_OPERATION; - rxrpc_reject_packet(local, skb); - return; - - /* can't change connection security type mid-flow */ -security_mismatch: - skb->priority = RX_PROTOCOL_ERROR; - rxrpc_reject_packet(local, skb); - return; + _leave(" = %p{%d}", call, call->debug_id); +out: + spin_unlock(&rx->incoming_lock); + return call; } /* @@ -490,11 +426,10 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_lock(&rx->call_lock); ret = -ENODATA; - if (list_empty(&rx->acceptq)) + if (list_empty(&rx->to_be_accepted)) goto out; /* check the user ID isn't already in use */ - ret = -EBADSLT; pp = &rx->calls.rb_node; parent = NULL; while (*pp) { @@ -506,11 +441,14 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, else if (user_call_ID > call->user_call_ID) pp = &(*pp)->rb_right; else - goto out; + goto id_in_use; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. + */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -528,31 +466,35 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ - rxrpc_get_call(call, rxrpc_call_got_userid); + rxrpc_get_call(call, rxrpc_call_got); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) - BUG(); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_queue_call(call); + rxrpc_notify_socket(call); + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %p{%d}", call, call->debug_id); return call; out_release: + _debug("release %p", call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - _debug("release %p", call); rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ERR_PTR(ret); -out: + rxrpc_put_call(call, rxrpc_call_put); + goto out; + +id_in_use: + ret = -EBADSLT; write_unlock(&rx->call_lock); +out: + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ERR_PTR(ret); } @@ -564,6 +506,7 @@ out: int rxrpc_reject_call(struct rxrpc_sock *rx) { struct rxrpc_call *call; + bool abort = false; int ret; _enter(""); @@ -572,15 +515,16 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->acceptq)) { + if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); - _leave(" = -ENODATA"); return -ENODATA; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. + */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -588,66 +532,28 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: - __rxrpc_set_call_completion(call, RXRPC_CALL_SERVER_BUSY, - 0, ECONNABORTED); - if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) - rxrpc_queue_call(call); - ret = 0; - break; + __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED); + abort = true; + /* fall through */ case RXRPC_CALL_COMPLETE: ret = call->error; - break; + goto out_discard; default: BUG(); } +out_discard: write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call - * @sock: The socket on which the impending call is waiting - * @user_call_ID: The tag to attach to the call - * @notify_rx: Where to send notifications instead of socket queue - * - * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. The caller should immediately trigger their own - * notification as there must be data waiting. - */ -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID, - rxrpc_notify_rx_t notify_rx) -{ - struct rxrpc_call *call; - - _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID, notify_rx); - _leave(" = %p", call); - return call; -} -EXPORT_SYMBOL(rxrpc_kernel_accept_call); - -/** - * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call - * @sock: The socket on which the impending call is waiting - * - * Allow a kernel service to reject an incoming call with a BUSY message, - * assuming the incoming call is still valid. - */ -int rxrpc_kernel_reject_call(struct socket *sock) -{ - int ret; - - _enter(""); - ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + if (abort) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ret; } -EXPORT_SYMBOL(rxrpc_kernel_reject_call); /* * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index af88ad7d2cf9..2b976e789562 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -22,1257 +22,286 @@ #include "ar-internal.h" /* - * propose an ACK be sent + * Set the timer */ -void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void rxrpc_set_timer(struct rxrpc_call *call) { - unsigned long expiry; - s8 prior = rxrpc_ack_priority[ack_reason]; - - ASSERTCMP(prior, >, 0); - - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); + unsigned long t, now = jiffies; - if (prior < rxrpc_ack_priority[call->ackr_reason]) { - if (immediate) - goto cancel_timer; - return; - } - - /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers */ - if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) { - call->ackr_skew = skew; - call->ackr_serial = serial; - } - if (immediate) - goto cancel_timer; - return; - } - - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - - switch (ack_reason) { - case RXRPC_ACK_DELAY: - _debug("run delay timer"); - expiry = rxrpc_soft_ack_delay; - goto run_timer; - - case RXRPC_ACK_IDLE: - if (!immediate) { - _debug("run defer timer"); - expiry = rxrpc_idle_ack_delay; - goto run_timer; - } - goto cancel_timer; + _enter("{%ld,%ld,%ld:%ld}", + call->ack_at - now, call->resend_at - now, call->expire_at - now, + call->timer.expires - now); + + read_lock_bh(&call->state_lock); - case RXRPC_ACK_REQUESTED: - expiry = rxrpc_requested_ack_delay; - if (!expiry) - goto cancel_timer; - if (!immediate || serial == 1) { - _debug("run defer timer"); - goto run_timer; + if (call->state < RXRPC_CALL_COMPLETE) { + t = call->ack_at; + if (time_before(call->resend_at, t)) + t = call->resend_at; + if (time_before(call->expire_at, t)) + t = call->expire_at; + if (!timer_pending(&call->timer) || + time_before(t, call->timer.expires)) { + _debug("set timer %ld", t - now); + mod_timer(&call->timer, t); } - - default: - _debug("immediate ACK"); - goto cancel_timer; } - -run_timer: - expiry += jiffies; - if (!timer_pending(&call->ack_timer) || - time_after(call->ack_timer.expires, expiry)) - mod_timer(&call->ack_timer, expiry); - return; - -cancel_timer: - _debug("cancel timer %%%u", serial); - try_to_del_timer_sync(&call->ack_timer); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } /* - * propose an ACK be sent, locking the call structure + * propose an ACK be sent */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, + bool background) { + unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - if (prior > rxrpc_ack_priority[call->ackr_reason]) { - spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, skew, serial, immediate); - spin_unlock_bh(&call->lock); - } -} - -/* - * set the resend timer - */ -static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, - unsigned long resend_at) -{ - read_lock_bh(&call->state_lock); - if (call->state == RXRPC_CALL_COMPLETE) - resend = 0; - - if (resend & 1) { - _debug("SET RESEND"); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); - } - - if (resend & 2) { - _debug("MODIFY RESEND TIMER"); - set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - mod_timer(&call->resend_timer, resend_at); - } else { - _debug("KILL RESEND TIMER"); - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - read_unlock_bh(&call->state_lock); -} - -/* - * resend packets - */ -static void rxrpc_resend(struct rxrpc_call *call) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - bool stop; - int loop; - u8 resend; - - _enter("{%d,%d,%d,%d},", - call->acks_hard, call->acks_unacked, - atomic_read(&call->sequence), - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - stop = false; - resend = 0; - resend_at = 0; - - for (loop = call->acks_tail; - loop != call->acks_head || stop; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - if (*p_txb & 1) - continue; - - txb = (struct sk_buff *) *p_txb; - sp = rxrpc_skb(txb); - - if (sp->need_resend) { - sp->need_resend = false; - - /* each Tx packet has a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - - whdr = (struct rxrpc_wire_header *)txb->head; - whdr->serial = htonl(sp->hdr.serial); - - _proto("Tx DATA %%%u { #%d }", - sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_data_packet(call->conn, txb) < 0) { - stop = true; - sp->resend_at = jiffies + 3; - } else { - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - sp->resend_at = - jiffies + rxrpc_resend_timeout; - } - } - - if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * handle resend timer expiry - */ -static void rxrpc_resend_timer(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 resend; - - _enter("%d,%d,%d", - call->acks_tail, call->acks_unacked, call->acks_head); - - if (call->state == RXRPC_CALL_COMPLETE) - return; - - resend = 0; - resend_at = 0; - - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - ASSERT(!(*p_txb & 1)); + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers, but we don't alter the timeout. + */ + _debug("prior %u %u vs %u %u", + ack_reason, prior, + call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); + if (ack_reason == call->ackr_reason) { + if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + call->ackr_serial = serial; + call->ackr_skew = skew; } + if (!immediate) + return; + } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + call->ackr_skew = skew; } - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * process soft ACKs of our transmitted packets - * - these indicate packets the peer has or has not received, but hasn't yet - * given to the consumer, and so can still be discarded and re-requested - */ -static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, - struct rxrpc_ackpacket *ack, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 sacks[RXRPC_MAXACKS], resend; - - _enter("{%d,%d},{%d},", - call->acks_hard, - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), - ack->nAcks); + switch (ack_reason) { + case RXRPC_ACK_REQUESTED: + if (rxrpc_requested_ack_delay < expiry) + expiry = rxrpc_requested_ack_delay; + if (serial == 1) + immediate = false; + break; - if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) - goto protocol_error; + case RXRPC_ACK_DELAY: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_soft_ack_delay; + break; - resend = 0; - resend_at = 0; - for (loop = 0; loop < ack->nAcks; loop++) { - p_txb = call->acks_window; - p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); + case RXRPC_ACK_IDLE: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_idle_ack_delay; + break; - switch (sacks[loop]) { - case RXRPC_ACK_TYPE_ACK: - sp->need_resend = false; - *p_txb |= 1; - break; - case RXRPC_ACK_TYPE_NACK: - sp->need_resend = true; - *p_txb &= ~1; - resend = 1; - break; - default: - _debug("Unsupported ACK type %d", sacks[loop]); - goto protocol_error; - } + default: + immediate = true; + break; } - smp_mb(); - call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); - - /* anything not explicitly ACK'd is implicitly NACK'd, but may just not - * have been received or processed yet by the far end */ - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - if (*p_txb & 1) { - /* packet must have been discarded */ - sp->need_resend = true; - *p_txb &= ~1; - resend |= 1; - } else if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + now = jiffies; + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("already scheduled"); + } else if (immediate || expiry == 0) { + _debug("immediate ACK %lx", call->events); + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) && + background) + rxrpc_queue_call(call); + } else { + ack_at = now + expiry; + _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); + if (time_before(ack_at, call->ack_at)) { + call->ack_at = ack_at; + rxrpc_set_timer(call); } } - - rxrpc_set_resend(call, resend, resend_at); - _leave(" = 0"); - return 0; - -protocol_error: - _leave(" = -EPROTO"); - return -EPROTO; } /* - * discard hard-ACK'd packets from the Tx window - */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) -{ - unsigned long _skb; - int tail = call->acks_tail, old_tail; - int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - - _enter("{%u,%u},%u", call->acks_hard, win, hard); - - ASSERTCMP(hard - call->acks_hard, <=, win); - - while (call->acks_hard < hard) { - smp_read_barrier_depends(); - _skb = call->acks_window[tail] & ~1; - rxrpc_free_skb((struct sk_buff *) _skb); - old_tail = tail; - tail = (tail + 1) & (call->acks_winsz - 1); - call->acks_tail = tail; - if (call->acks_unacked == old_tail) - call->acks_unacked = tail; - call->acks_hard++; - } - - wake_up(&call->waitq); -} - -/* - * clear the Tx window in the event of a failure + * propose an ACK be sent, locking the call structure */ -static void rxrpc_clear_tx_window(struct rxrpc_call *call) +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, bool background) { - rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, skew, serial, + immediate, background); + spin_unlock_bh(&call->lock); } /* - * drain the out of sequence received packet queue into the packet Rx queue + * Perform retransmission of NAK'd and unack'd packets. */ -static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +static void rxrpc_resend(struct rxrpc_call *call) { + struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; struct sk_buff *skb; - bool terminal; - int ret; + rxrpc_seq_t cursor, seq, top; + unsigned long resend_at, now; + int ix; + u8 annotation; - _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); spin_lock_bh(&call->lock); - ret = -ECONNRESET; - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - goto socket_unavailable; + cursor = call->tx_hard_ack; + top = call->tx_top; + ASSERT(before_eq(cursor, top)); + if (cursor == top) + goto out_unlock; + + /* Scan the packet list without dropping the lock and decide which of + * the packets in the Tx buffer we're going to resend and what the new + * resend timeout will be. + */ + now = jiffies; + resend_at = now + rxrpc_resend_timeout; + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation == RXRPC_TX_ANNO_ACK) + continue; - skb = skb_dequeue(&call->rx_oos_queue); - if (skb) { + skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb); sp = rxrpc_skb(skb); - _debug("drain OOS packet %d [%d]", - sp->hdr.seq, call->rx_first_oos); - - if (sp->hdr.seq != call->rx_first_oos) { - skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - _debug("requeue %p {%u}", skb, call->rx_first_oos); - } else { - skb->mark = RXRPC_SKB_MARK_DATA; - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); - BUG_ON(ret < 0); - _debug("drain #%u", call->rx_data_post); - call->rx_data_post++; - - /* find out what the next packet is */ - skb = skb_peek(&call->rx_oos_queue); - rxrpc_see_skb(skb); - if (skb) - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - else - call->rx_first_oos = 0; - _debug("peek %p {%u}", skb, call->rx_first_oos); - } - } - - ret = 0; -socket_unavailable: - spin_unlock_bh(&call->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * insert an out of sequence packet into the buffer - */ -static void rxrpc_insert_oos_packet(struct rxrpc_call *call, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp, *psp; - struct sk_buff *p; - u32 seq; - - sp = rxrpc_skb(skb); - seq = sp->hdr.seq; - _enter(",,{%u}", seq); - - skb->destructor = rxrpc_packet_destructor; - ASSERTCMP(sp->call, ==, NULL); - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - - /* insert into the buffer in sequence order */ - spin_lock_bh(&call->lock); - - skb_queue_walk(&call->rx_oos_queue, p) { - psp = rxrpc_skb(p); - if (psp->hdr.seq > seq) { - _debug("insert oos #%u before #%u", seq, psp->hdr.seq); - skb_insert(p, skb, &call->rx_oos_queue); - goto inserted; - } - } - - _debug("append oos #%u", seq); - skb_queue_tail(&call->rx_oos_queue, skb); -inserted: - - /* we might now have a new front to the queue */ - if (call->rx_first_oos == 0 || seq < call->rx_first_oos) - call->rx_first_oos = seq; - - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - } - read_unlock(&call->state_lock); - - spin_unlock_bh(&call->lock); - _leave(" [stored #%u]", call->rx_first_oos); -} - -/* - * clear the Tx window on final ACK reception - */ -static void rxrpc_zap_tx_window(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - unsigned long _skb, *acks_window; - u8 winsz = call->acks_winsz; - int tail; - - acks_window = call->acks_window; - call->acks_window = NULL; - - while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { - tail = call->acks_tail; - smp_read_barrier_depends(); - _skb = acks_window[tail] & ~1; - smp_mb(); - call->acks_tail = (call->acks_tail + 1) & (winsz - 1); - - skb = (struct sk_buff *) _skb; - sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb(skb); - } - - kfree(acks_window); -} - -/* - * process the extra information that may be appended to an ACK packet - */ -static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int latest, int nAcks) -{ - struct rxrpc_ackinfo ackinfo; - struct rxrpc_peer *peer; - unsigned int mtu; - - if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { - _leave(" [no ackinfo]"); - return; - } - - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - latest, - ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), - ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); - - mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); - } -} - -/* - * process packets in the reception queue - */ -static int rxrpc_process_rx_queue(struct rxrpc_call *call, - u32 *_abort_code) -{ - struct rxrpc_ackpacket ack; - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool post_ACK; - int latest; - u32 hard, tx; - - _enter(""); - -process_further: - skb = skb_dequeue(&call->rx_queue); - if (!skb) - return -EAGAIN; - - rxrpc_see_skb(skb); - _net("deferred skb %p", skb); - - sp = rxrpc_skb(skb); - - _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); - - post_ACK = false; - - switch (sp->hdr.type) { - /* data packets that wind up here have been received out of - * order, need security processing or are jumbo packets */ - case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - /* secured packets must be verified and possibly decrypted */ - if (call->conn->security->verify_packet(call, skb, - sp->hdr.seq, - sp->hdr.cksum) < 0) - goto protocol_error; - - rxrpc_insert_oos_packet(call, skb); - goto process_further; - - /* partial ACK to process */ - case RXRPC_PACKET_TYPE_ACK: - if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { - _debug("extraction failure"); - goto protocol_error; - } - if (!skb_pull(skb, sizeof(ack))) - BUG(); - - latest = sp->hdr.serial; - hard = ntohl(ack.firstPacket); - tx = atomic_read(&call->sequence); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - latest, - ntohs(ack.maxSkew), - hard, - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); - - if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", latest); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skb->priority, sp->hdr.serial, true); - } - - /* discard any out-of-order or duplicate ACKs */ - if (latest - call->acks_latest <= 0) { - _debug("discard ACK %d <= %d", - latest, call->acks_latest); - goto discard; - } - call->acks_latest = latest; - - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && - call->state != RXRPC_CALL_SERVER_SEND_REPLY && - call->state != RXRPC_CALL_SERVER_AWAIT_ACK) - goto discard; - - _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); - - if (hard > 0) { - if (hard - 1 > tx) { - _debug("hard-ACK'd packet %d not transmitted" - " (%d top)", - hard - 1, tx); - goto protocol_error; - } - - if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || - call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) { - call->acks_hard = tx; - goto all_acked; + if (annotation == RXRPC_TX_ANNO_UNACK) { + if (time_after(sp->resend_at, now)) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + continue; } - - smp_rmb(); - rxrpc_rotate_tx_window(call, hard - 1); - } - - if (ack.nAcks > 0) { - if (hard - 1 + ack.nAcks > tx) { - _debug("soft-ACK'd packet %d+%d not" - " transmitted (%d top)", - hard - 1, ack.nAcks, tx); - goto protocol_error; - } - - if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) - goto protocol_error; } - goto discard; - /* complete ACK to process */ - case RXRPC_PACKET_TYPE_ACKALL: - goto all_acked; - - /* abort and busy are handled elsewhere */ - case RXRPC_PACKET_TYPE_BUSY: - case RXRPC_PACKET_TYPE_ABORT: - BUG(); - - /* connection level events - also handled elsewhere */ - case RXRPC_PACKET_TYPE_CHALLENGE: - case RXRPC_PACKET_TYPE_RESPONSE: - case RXRPC_PACKET_TYPE_DEBUG: - BUG(); - } - - /* if we've had a hard ACK that covers all the packets we've sent, then - * that ends that phase of the operation */ -all_acked: - write_lock_bh(&call->state_lock); - _debug("ack all %d", call->state); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - break; - case RXRPC_CALL_SERVER_AWAIT_ACK: - _debug("srv complete"); - __rxrpc_call_completed(call); - post_ACK = true; - break; - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_RECV_REQUEST: - goto protocol_error_unlock; /* can't occur yet */ - default: - write_unlock_bh(&call->state_lock); - goto discard; /* assume packet left over from earlier phase */ - } - - write_unlock_bh(&call->state_lock); - - /* if all the packets we sent are hard-ACK'd, then we can discard - * whatever we've got left */ - _debug("clear Tx %d", - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - - if (call->acks_window) - rxrpc_zap_tx_window(call); + /* Okay, we need to retransmit a packet. */ + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + seq++; + } while (before_eq(seq, top)); + + call->resend_at = resend_at; + + /* Now go through the Tx window and perform the retransmissions. We + * have to drop the lock for each send. If an ACK comes in whilst the + * lock is dropped, it may clear some of the retransmission markers for + * packets that it soft-ACKs. + */ + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation != RXRPC_TX_ANNO_RETRANS) + continue; - if (post_ACK) { - /* post the final ACK message for userspace to pick up */ - _debug("post ACK"); - skb->mark = RXRPC_SKB_MARK_FINAL_ACK; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - spin_lock_bh(&call->lock); - if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) - BUG(); + skb = call->rxtx_buffer[ix]; + rxrpc_get_skb(skb); spin_unlock_bh(&call->lock); - goto process_further; - } - -discard: - rxrpc_free_skb(skb); - goto process_further; - -protocol_error_unlock: - write_unlock_bh(&call->state_lock); -protocol_error: - rxrpc_free_skb(skb); - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * post a message to the socket Rx queue for recvmsg() to pick up - */ -static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, - bool fatal) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - int ret; - - _enter("{%d,%lx},%u,%u,%d", - call->debug_id, call->flags, mark, error, fatal); - - /* remove timers and things for fatal messages */ - if (fatal) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } + sp = rxrpc_skb(skb); - if (mark != RXRPC_SKB_MARK_NEW_CALL && - !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - _leave("[no userid]"); - return 0; - } + /* Each Tx packet needs a new serial number */ + sp->hdr.serial = atomic_inc_return(&call->conn->serial); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - skb = alloc_skb(0, GFP_NOFS); - if (!skb) - return -ENOMEM; + whdr = (struct rxrpc_wire_header *)skb->head; + whdr->serial = htonl(sp->hdr.serial); - rxrpc_new_skb(skb); + if (rxrpc_send_data_packet(call->conn, skb) < 0) { + call->resend_at = now + 2; + rxrpc_free_skb(skb); + return; + } - skb->mark = mark; - - sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); - sp->error = error; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = now + rxrpc_resend_timeout; + rxrpc_free_skb(skb); spin_lock_bh(&call->lock); - ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); - spin_unlock_bh(&call->lock); - BUG_ON(ret < 0); - } - return 0; + /* We need to clear the retransmit state, but there are two + * things we need to be aware of: A new ACK/NAK might have been + * received and the packet might have been hard-ACK'd (in which + * case it will no longer be in the buffer). + */ + if (after(seq, call->tx_hard_ack) && + (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS || + call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)) + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + + if (after(call->tx_hard_ack, seq)) + seq = call->tx_hard_ack; + seq++; + } while (before_eq(seq, top)); + +out_unlock: + spin_unlock_bh(&call->lock); + _leave(""); } /* - * Handle background processing of incoming call packets and ACK / abort - * generation. A ref on the call is donated to us by whoever queued the work - * item. + * Handle retransmission and deferred ACK/abort generation. */ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo ackinfo; - struct msghdr msg; - struct kvec iov[5]; - enum rxrpc_call_event genbit; - unsigned long bits; - __be32 data, pad; - size_t len; - bool requeue = false; - int loop, nbit, ioc, ret, mtu; - u32 serial, abort_code = RX_PROTOCOL_ERROR; - u8 *acks = NULL; + unsigned long now; rxrpc_see_call(call); //printk("\n--------------------\n"); - _enter("{%d,%s,%lx} [%lu]", - call->debug_id, rxrpc_call_states[call->state], call->events, - (jiffies - call->creation_jif) / (HZ / 10)); - - if (call->state >= RXRPC_CALL_COMPLETE) { - rxrpc_put_call(call, rxrpc_call_put); - return; - } - - if (!call->conn) - goto skip_msg_init; - - /* there's a good chance we're going to have to send a message, so set - * one up in advance */ - msg.msg_name = &call->peer->srx.transport; - msg.msg_namelen = call->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + _enter("{%d,%s,%lx}", + call->debug_id, rxrpc_call_states[call->state], call->events); - whdr.epoch = htonl(call->conn->proto.epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ACK; - whdr.flags = call->conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = call->conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(call->service_id); - - memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); -skip_msg_init: - - /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { - enum rxrpc_skb_mark mark; - - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - if (call->completion == RXRPC_CALL_NETWORK_ERROR) { - mark = RXRPC_SKB_MARK_NET_ERROR; - _debug("post net error %d", call->error); - } else { - mark = RXRPC_SKB_MARK_LOCAL_ERROR; - _debug("post net local error %d", call->error); - } - - if (rxrpc_post_message(call, mark, call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - _debug("post conn abort"); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { - whdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_EV_REJECT_BUSY; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->abort_code); - iov[1].iov_base = &data; - iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_EV_ABORT; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_EV_ACK_FINAL; - - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - ack.serial = 0; - ack.reason = RXRPC_ACK_IDLE; - ack.nAcks = 0; - call->ackr_reason = 0; - - spin_lock_bh(&call->lock); - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = &pad; - iov[2].iov_len = 3; - iov[3].iov_base = &ackinfo; - iov[3].iov_len = sizeof(ackinfo); - goto send_ACK; +recheck_state: + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + goto recheck_state; } - if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | - (1 << RXRPC_CALL_EV_RCVD_ABORT)) - ) { - u32 mark; - - if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) - mark = RXRPC_SKB_MARK_REMOTE_ABORT; - else - mark = RXRPC_SKB_MARK_BUSY; - - _debug("post abort/busy"); - rxrpc_clear_tx_window(call); - if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; + if (call->state == RXRPC_CALL_COMPLETE) { + del_timer_sync(&call->timer); + goto out_put; } - if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { - _debug("do implicit ackall"); - rxrpc_clear_tx_window(call); - } - - if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { + now = jiffies; + if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); - - _debug("post timeout"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ETIME, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - goto kill_ACKs; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } - /* deal with assorted inbound messages */ - if (!skb_queue_empty(&call->rx_queue)) { - ret = rxrpc_process_rx_queue(call, &abort_code); - switch (ret) { - case 0: - case -EAGAIN: - break; - case -ENOMEM: - goto no_mem; - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EPROTO: - rxrpc_abort_call("PRO", call, 0, abort_code, -ret); - goto kill_ACKs; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || + time_after_eq(now, call->ack_at)) { + call->ack_at = call->expire_at; + if (call->ackr_reason) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto recheck_state; } } - /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || + time_after_eq(now, call->resend_at)) { rxrpc_resend(call); - - /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("send ACK: window: %d - %d { %lx }", - call->rx_data_eaten, call->ackr_win_top, - call->ackr_window[0]); - - if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && - call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { - /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - goto maybe_reschedule; - } - - genbit = RXRPC_CALL_EV_ACK; - - acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, - GFP_NOFS); - if (!acks) - goto no_mem; - - //hdr.flags = RXRPC_SLOW_START_OK; - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - - spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - - ack.nAcks = 0; - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - nbit = loop * BITS_PER_LONG; - for (bits = call->ackr_window[loop]; bits; bits >>= 1 - ) { - _debug("- l=%d n=%d b=%lx", loop, nbit, bits); - if (bits & 1) { - acks[nbit] = RXRPC_ACK_TYPE_ACK; - ack.nAcks = nbit + 1; - } - nbit++; - } - } - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = acks; - iov[2].iov_len = ack.nAcks; - iov[3].iov_base = &pad; - iov[3].iov_len = 3; - iov[4].iov_base = &ackinfo; - iov[4].iov_len = sizeof(ackinfo); - - switch (ack.reason) { - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - case RXRPC_ACK_NOSPACE: - case RXRPC_ACK_PING: - case RXRPC_ACK_PING_RESPONSE: - goto send_ACK_with_skew; - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - goto send_ACK; - } + goto recheck_state; } - /* handle completion of security negotiations on an incoming - * connection */ - if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { - _debug("secured"); - spin_lock_bh(&call->lock); - - if (call->state == RXRPC_CALL_SERVER_SECURING) { - struct rxrpc_sock *rx; - _debug("securing"); - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (rx) { - write_lock(&rx->call_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &rx->acceptq); - } - write_unlock(&rx->call_lock); - } - rcu_read_unlock(); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - read_unlock(&call->state_lock); - } - - spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) - goto maybe_reschedule; - } - - /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { - _debug("post accept"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, - 0, false) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - goto maybe_reschedule; - } - - /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { - _debug("accepted"); - ASSERTCMP(call->rx_data_post, ==, 0); - call->rx_data_post = 1; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - read_unlock_bh(&call->state_lock); - } - - /* drain the out of sequence received packet queue into the packet Rx - * queue */ - if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { - while (call->rx_data_post == call->rx_first_oos) - if (rxrpc_drain_rx_oos_queue(call) < 0) - break; - goto maybe_reschedule; - } + rxrpc_set_timer(call); /* other events may have been raised since we started checking */ - goto maybe_reschedule; - -send_ACK_with_skew: - ack.maxSkew = htons(call->ackr_skew); -send_ACK: - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rxrpc_rx_window_size); - - /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - del_timer_sync(&call->ack_timer); - if (ack.nAcks > 0) - set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); - goto send_message_2; - -send_message: - _debug("send message"); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); -send_message_2: - - len = iov[0].iov_len; - ioc = 1; - if (iov[4].iov_len) { - ioc = 5; - len += iov[4].iov_len; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[3].iov_len) { - ioc = 4; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[2].iov_len) { - ioc = 3; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[1].iov_len) { - ioc = 2; - len += iov[1].iov_len; - } - - ret = kernel_sendmsg(call->conn->params.local->socket, - &msg, iov, ioc, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - goto error; - } - - switch (genbit) { - case RXRPC_CALL_EV_ABORT: - clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - - case RXRPC_CALL_EV_ACK_FINAL: - rxrpc_call_completed(call); - goto kill_ACKs; - - default: - clear_bit(genbit, &call->events); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - _debug("start ACK timer"); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_skew, call->ackr_serial, - false); - default: - break; - } - goto maybe_reschedule; - } - -kill_ACKs: - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - -maybe_reschedule: - if (call->events || !skb_queue_empty(&call->rx_queue)) { - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - } - -error: - kfree(acks); - - if ((requeue || call->events) && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->proto.cid); + if (call->events && call->state < RXRPC_CALL_COMPLETE) { __rxrpc_queue_call(call); - } else { - rxrpc_put_call(call, rxrpc_call_put); + goto out; } +out_put: + rxrpc_put_call(call, rxrpc_call_put); +out: _leave(""); - return; - -no_mem: - _debug("out of memory"); - goto maybe_reschedule; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d233adc9b5e5..18ab13f82f6e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -30,7 +30,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", - [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", @@ -43,7 +42,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", - [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", @@ -57,10 +55,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_queued_ref] = "QUR", [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", - [rxrpc_call_got_skb] = "Gsk", [rxrpc_call_got_userid] = "Gus", [rxrpc_call_put] = "PUT", - [rxrpc_call_put_skb] = "Psk", [rxrpc_call_put_userid] = "Pus", [rxrpc_call_put_noqueue] = "PNQ", }; @@ -69,9 +65,15 @@ struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_ack_time_expired(unsigned long _call); -static void rxrpc_resend_time_expired(unsigned long _call); +static void rxrpc_call_timer_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *)_call; + + _enter("%d", call->debug_id); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_queue_call(call); +} /* * find an extant server call @@ -121,27 +123,24 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call) return NULL; - call->acks_winsz = 16; - call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE, + sizeof(struct sk_buff *), gfp); - if (!call->acks_window) { - kmem_cache_free(rxrpc_call_jar, call); - return NULL; - } + if (!call->rxtx_buffer) + goto nomem; - setup_timer(&call->lifetimer, &rxrpc_call_life_expired, - (unsigned long) call); - setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, - (unsigned long) call); - setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, - (unsigned long) call); + call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp); + if (!call->rxtx_annotations) + goto nomem_2; + + setup_timer(&call->timer, rxrpc_call_timer_expired, + (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); - skb_queue_head_init(&call->rx_queue); - skb_queue_head_init(&call->rx_oos_queue); - skb_queue_head_init(&call->knlrecv_queue); + INIT_LIST_HEAD(&call->recvmsg_link); + INIT_LIST_HEAD(&call->sock_link); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); @@ -150,63 +149,52 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - call->rx_data_expect = 1; - call->rx_data_eaten = 0; - call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; - call->creation_jif = jiffies; + /* Leave space in the ring to handle a maxed-out jumbo packet */ + call->rx_winsize = RXRPC_RXTX_BUFF_SIZE - 1 - 46; + call->tx_winsize = 16; + call->rx_expect_next = 1; return call; + +nomem_2: + kfree(call->rxtx_buffer); +nomem: + kmem_cache_free(rxrpc_call_jar, call); + return NULL; } /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, - struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; _enter(""); - ASSERT(rx->local != NULL); - call = rxrpc_alloc_call(gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - call->rx_data_post = 1; call->service_id = srx->srx_service; - rcu_assign_pointer(call->socket, rx); _leave(" = %p", call); return call; } /* - * Begin client call. + * Initiate the call ack/resend/expiry timer. */ -static int rxrpc_begin_client_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static void rxrpc_start_call_timer(struct rxrpc_call *call) { - int ret; - - /* Set up or get a connection record and set the protocol parameters, - * including channel number and call ID. - */ - ret = rxrpc_connect_call(call, cp, srx, gfp); - if (ret < 0) - return ret; - - spin_lock(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); - spin_unlock(&call->conn->params.peer->lock); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - return 0; + unsigned long expire_at; + + expire_at = jiffies + rxrpc_max_call_lifetime; + call->expire_at = expire_at; + call->ack_at = expire_at; + call->resend_at = expire_at; + call->timer.expires = expire_at; + add_timer(&call->timer); } /* @@ -226,7 +214,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; @@ -255,19 +243,32 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } + rcu_assign_pointer(call->socket, rx); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); - ret = rxrpc_begin_client_call(call, cp, srx, gfp); + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); if (ret < 0) goto error; + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); + + rxrpc_start_call_timer(call); + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); _leave(" = %p [new]", call); @@ -279,9 +280,9 @@ error: write_unlock(&rx->call_lock); rxrpc_put_call(call, rxrpc_call_put_userid); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); error_out: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, @@ -303,142 +304,46 @@ found_user_ID_now_present: } /* - * set up an incoming call - * - called in process context with IRQs enabled + * Set up an incoming call. call->conn points to the connection. + * This is called in BH context and isn't allowed to fail. */ -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct sk_buff *skb) +void rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct sk_buff *skb) { + struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call, *candidate; - const void *here = __builtin_return_address(0); - u32 call_id, chan; - - _enter(",%d", conn->debug_id); - - ASSERT(rx != NULL); - - candidate = rxrpc_alloc_call(GFP_NOIO); - if (!candidate) - return ERR_PTR(-EBUSY); + u32 chan; - trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), here, NULL); + _enter(",%d", call->conn->debug_id); - chan = sp->hdr.cid & RXRPC_CHANNELMASK; - candidate->conn = conn; - candidate->peer = conn->params.peer; - candidate->cid = sp->hdr.cid; - candidate->call_id = sp->hdr.callNumber; - candidate->security_ix = sp->hdr.securityIndex; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; - candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); - if (conn->security_ix > 0) - candidate->state = RXRPC_CALL_SERVER_SECURING; - rcu_assign_pointer(candidate->socket, rx); - - spin_lock(&conn->channel_lock); - - /* set the channel for this call */ - call = rcu_dereference_protected(conn->channels[chan].call, - lockdep_is_held(&conn->channel_lock)); - - _debug("channel[%u] is %p", candidate->cid & RXRPC_CHANNELMASK, call); - if (call && call->call_id == sp->hdr.callNumber) { - /* already set; must've been a duplicate packet */ - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - rxrpc_queue_call(call); - case RXRPC_CALL_REMOTELY_ABORTED: - read_unlock(&call->state_lock); - goto aborted_call; - default: - rxrpc_get_call(call, rxrpc_call_got); - read_unlock(&call->state_lock); - goto extant_call; - } - } - - if (call) { - /* it seems the channel is still in use from the previous call - * - ditch the old binding if its call is now complete */ - _debug("CALL: %u { %s }", - call->debug_id, rxrpc_call_states[call->state]); - - if (call->state == RXRPC_CALL_COMPLETE) { - __rxrpc_disconnect_call(conn, call); - } else { - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -EBUSY"); - return ERR_PTR(-EBUSY); - } - } - - /* check the call number isn't duplicate */ - _debug("check dup"); - call_id = sp->hdr.callNumber; - - /* We just ignore calls prior to the current call ID. Terminated calls - * are handled via the connection. + rcu_assign_pointer(call->socket, rx); + call->call_id = sp->hdr.callNumber; + call->service_id = sp->hdr.serviceId; + call->cid = sp->hdr.cid; + call->state = RXRPC_CALL_SERVER_ACCEPTING; + if (sp->hdr.securityIndex > 0) + call->state = RXRPC_CALL_SERVER_SECURING; + + /* Set the channel for this call. We don't get channel_lock as we're + * only defending against the data_ready handler (which we're called + * from) and the RESPONSE packet parser (which is only really + * interested in call_counter and can cope with a disagreement with the + * call pointer). */ - if (call_id <= conn->channels[chan].call_counter) - goto old_call; /* TODO: Just drop packet */ - - /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ - rxrpc_get_call(candidate, rxrpc_call_got); - - /* make the call available */ - _debug("new call"); - call = candidate; - candidate = NULL; - conn->channels[chan].call_counter = call_id; + chan = sp->hdr.cid & RXRPC_CHANNELMASK; + conn->channels[chan].call_counter = call->call_id; + conn->channels[chan].call_id = call->call_id; rcu_assign_pointer(conn->channels[chan].call, call); - rxrpc_get_connection(conn); - rxrpc_get_peer(call->peer); - spin_unlock(&conn->channel_lock); spin_lock(&conn->params.peer->lock); hlist_add_head(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); - write_lock_bh(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); - - call->service_id = conn->params.service_id; - _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - _leave(" = %p {%d} [new]", call, call->debug_id); - return call; - -extant_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); - return call; - -aborted_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNABORTED"); - return ERR_PTR(-ECONNABORTED); - -old_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNRESET [old]"); - return ERR_PTR(-ECONNRESET); + rxrpc_start_call_timer(call); + _leave(""); } /* @@ -497,25 +402,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Note the addition of a ref on a call for a socket buffer. + * Detach a call from its owning socket. */ -void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) +void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { - const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&call->usage); + struct rxrpc_connection *conn = call->conn; + bool put = false; + int i; - trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); -} + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); -/* - * detach a call from a socket and set up for release - */ -void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - _enter("{%d,%d,%d,%d}", - call->debug_id, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - call->rx_first_oos); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); rxrpc_see_call(call); @@ -524,80 +421,46 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) BUG(); spin_unlock_bh(&call->lock); - /* dissociate from the socket - * - the socket's ref on the call is passed to the death timer - */ - _debug("RELEASE CALL %p (%d)", call, call->debug_id); + del_timer_sync(&call->timer); - if (call->peer) { - spin_lock(&call->peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); - } + /* Make sure we don't get any more notifications */ + write_lock_bh(&rx->recvmsg_lock); - write_lock_bh(&rx->call_lock); - if (!list_empty(&call->accept_link)) { + if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); - ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + list_del(&call->recvmsg_link); + put = true; + } + + /* list_empty() must return false in rxrpc_notify_socket() */ + call->recvmsg_link.next = NULL; + call->recvmsg_link.prev = NULL; + + write_unlock_bh(&rx->recvmsg_lock); + if (put) + rxrpc_put_call(call, rxrpc_call_put); + + write_lock(&rx->call_lock); + + if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_put_call(call, rxrpc_call_put_userid); } - write_unlock_bh(&rx->call_lock); - - /* free up the channel for reuse */ - if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) { - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); - rxrpc_call_completed(call); - } else { - write_lock_bh(&call->state_lock); - - if (call->state < RXRPC_CALL_COMPLETE) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - } - - write_unlock_bh(&call->state_lock); - } - if (call->conn) + list_del(&call->sock_link); + write_unlock(&rx->call_lock); + + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + if (conn) rxrpc_disconnect_call(call); - /* clean up the Rx queue */ - if (!skb_queue_empty(&call->rx_queue) || - !skb_queue_empty(&call->rx_oos_queue)) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - - _debug("purge Rx queues"); - - spin_lock_bh(&call->lock); - while ((skb = skb_dequeue(&call->rx_queue)) || - (skb = skb_dequeue(&call->rx_oos_queue))) { - spin_unlock_bh(&call->lock); - - sp = rxrpc_skb(skb); - _debug("- zap %s %%%u #%u", - rxrpc_pkts[sp->hdr.type], - sp->hdr.serial, sp->hdr.seq); - rxrpc_free_skb(skb); - spin_lock_bh(&call->lock); - } - spin_unlock_bh(&call->lock); + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + rxrpc_free_skb(call->rxtx_buffer[i]); + call->rxtx_buffer[i] = NULL; } - rxrpc_purge_queue(&call->knlrecv_queue); - - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->lifetimer); /* We have to release the prealloc backlog ref */ if (rxrpc_is_service_call(call)) @@ -611,28 +474,19 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; - struct rb_node *p; _enter("%p", rx); - read_lock_bh(&rx->call_lock); - - /* kill the not-yet-accepted incoming calls */ - list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_release_call(rx, call); - } - - list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_release_call(rx, call); - } - - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); + while (!list_empty(&rx->sock_calls)) { + call = list_entry(rx->sock_calls.next, + struct rxrpc_call, sock_link); + rxrpc_get_call(call, rxrpc_call_got); + rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); } - read_unlock_bh(&rx->call_lock); _leave(""); } @@ -651,23 +505,12 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - rxrpc_cleanup_call(call); - } -} + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); -/* - * Release a call ref held by a socket buffer. - */ -void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) -{ - const void *here = __builtin_return_address(0); - int n; + write_lock(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock(&rxrpc_call_lock); - n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); - ASSERTCMP(n, >=, 0); - if (n == 0) { - _debug("call %d dead", call->debug_id); rxrpc_cleanup_call(call); } } @@ -679,9 +522,9 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); - rxrpc_purge_queue(&call->rx_queue); - rxrpc_purge_queue(&call->knlrecv_queue); rxrpc_put_peer(call->peer); + kfree(call->rxtx_buffer); + kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); } @@ -690,49 +533,24 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) */ void rxrpc_cleanup_call(struct rxrpc_call *call) { - _net("DESTROY CALL %d", call->debug_id); + int i; - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->lifetimer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->resend_timer); + del_timer_sync(&call->timer); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERT(!work_pending(&call->processor)); ASSERTCMP(call->conn, ==, NULL); - if (call->acks_window) { - _debug("kill Tx window %d", - CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz)); - smp_mb(); - while (CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz) > 0) { - struct rxrpc_skb_priv *sp; - unsigned long _skb; - - _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *)_skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb((struct sk_buff *)_skb); - call->acks_tail = - (call->acks_tail + 1) & (call->acks_winsz - 1); - } - - kfree(call->acks_window); - } + /* Clean up the Rx/Tx buffer */ + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) + rxrpc_free_skb(call->rxtx_buffer[i]); rxrpc_free_skb(call->tx_pending); - rxrpc_purge_queue(&call->rx_queue); - ASSERT(skb_queue_empty(&call->rx_oos_queue)); - rxrpc_purge_queue(&call->knlrecv_queue); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } @@ -747,8 +565,8 @@ void __exit rxrpc_destroy_all_calls(void) if (list_empty(&rxrpc_calls)) return; - - write_lock_bh(&rxrpc_call_lock); + + write_lock(&rxrpc_call_lock); while (!list_empty(&rxrpc_calls)) { call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); @@ -757,74 +575,15 @@ void __exit rxrpc_destroy_all_calls(void) rxrpc_see_call(call); list_del_init(&call->link); - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), rxrpc_call_states[call->state], call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); cond_resched(); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); } - write_unlock_bh(&rxrpc_call_lock); - _leave(""); -} - -/* - * handle call lifetime being exceeded - */ -static void rxrpc_call_life_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - rxrpc_queue_call(call); -} - -/* - * handle resend timer expiry - * - may not take call->state_lock as this can deadlock against del_timer_sync() - */ -static void rxrpc_resend_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); -} - -/* - * handle ACK timer expiry - */ -static void rxrpc_ack_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); + write_unlock(&rxrpc_call_lock); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8c7938ba6a84..0691007cfc02 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -15,10 +15,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -140,16 +136,10 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, u32 abort_code, int error) { struct rxrpc_call *call; - bool queue; - int i, bit; + int i; _enter("{%d},%x", conn->debug_id, abort_code); - if (compl == RXRPC_CALL_LOCALLY_ABORTED) - bit = RXRPC_CALL_EV_CONN_ABORT; - else - bit = RXRPC_CALL_EV_RCVD_ABORT; - spin_lock(&conn->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { @@ -157,22 +147,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, conn->channels[i].call, lockdep_is_held(&conn->channel_lock)); if (call) { - rxrpc_see_call(call); if (compl == RXRPC_CALL_LOCALLY_ABORTED) trace_rxrpc_abort("CON", call->cid, call->call_id, 0, abort_code, error); - - write_lock_bh(&call->state_lock); - if (rxrpc_set_call_completion(call, compl, abort_code, - error)) { - set_bit(bit, &call->events); - queue = true; - } - write_unlock_bh(&call->state_lock); - if (queue) - rxrpc_queue_call(call); - + if (rxrpc_set_call_completion(call, compl, + abort_code, error)) + rxrpc_notify_socket(call); } } @@ -251,17 +232,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, /* * mark a call as being on a now-secured channel - * - must be called with softirqs disabled + * - must be called with BH's disabled. */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { _enter("%p", call); if (call) { - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_SERVER_SECURING) { + call->state = RXRPC_CALL_SERVER_ACCEPTING; + rxrpc_notify_socket(call); + } + write_unlock_bh(&call->state_lock); } } @@ -278,7 +260,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - kleave(" = -ECONNABORTED [%u]", conn->state); + _leave(" = -ECONNABORTED [%u]", conn->state); return -ECONNABORTED; } @@ -291,14 +273,14 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, 0, RXRPC_CALL_REMOTELY_ABORTED, - abort_code); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -323,14 +305,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; + spin_unlock(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->channel_lock))); + } else { + spin_unlock(&conn->state_lock); } - spin_unlock(&conn->state_lock); spin_unlock(&conn->channel_lock); return 0; @@ -433,88 +417,3 @@ protocol_error: _leave(" [EPROTO]"); goto out; } - -/* - * put a packet up for transport-level abort - */ -void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - CHECK_SLAB_OKAY(&local->usage); - - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); -} - -/* - * reject packets through the local endpoint - */ -void rxrpc_reject_packets(struct rxrpc_local *local) -{ - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; - struct rxrpc_skb_priv *sp; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - struct msghdr msg; - struct kvec iov[2]; - size_t size; - __be32 code; - - _enter("%d", local->debug_id); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &code; - iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); - - msg.msg_name = &sa; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - - memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; - - while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - code = htonl(skb->priority); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; - } - - rxrpc_free_skb(skb); - } - - _leave(""); -} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8da82e3aa00e..ffa9addb97b2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -169,7 +169,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, chan->last_abort = call->abort_code; chan->last_type = RXRPC_PACKET_TYPE_ABORT; } else { - chan->last_seq = call->rx_data_eaten; + chan->last_seq = call->rx_hard_ack; chan->last_type = RXRPC_PACKET_TYPE_ACK; } /* Sync with rxrpc_conn_retransmit(). */ @@ -191,6 +191,10 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + spin_lock_bh(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock_bh(&conn->params.peer->lock); + if (rxrpc_is_client_call(call)) return rxrpc_disconnect_client_call(call); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 189338a60457..83d54da4ce8b 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -65,9 +65,8 @@ done: * Insert a service connection into a peer's tree, thereby making it a target * for incoming packets. */ -static struct rxrpc_connection * -rxrpc_publish_service_conn(struct rxrpc_peer *peer, - struct rxrpc_connection *conn) +static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) { struct rxrpc_connection *cursor = NULL; struct rxrpc_conn_proto k = conn->proto; @@ -96,7 +95,7 @@ conn_published: set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); write_sequnlock_bh(&peer->service_conn_lock); _leave(" = %d [new]", conn->debug_id); - return conn; + return; found_extant_conn: if (atomic_read(&cursor->usage) == 0) @@ -143,106 +142,30 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) } /* - * get a record of an incoming connection + * Set up an incoming connection. This is called in BH context with the RCU + * read lock held. */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - const char *new = "old"; _enter(""); - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) { - _debug("no peer"); - return ERR_PTR(-EBUSY); - } - - ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - - rcu_read_lock(); - peer = rxrpc_lookup_peer_rcu(local, srx); - if (peer) { - conn = rxrpc_find_service_conn_rcu(peer, skb); - if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto security_mismatch_rcu; - if (rxrpc_get_connection_maybe(conn)) - goto found_extant_connection_rcu; - - /* The conn has expired but we can't remove it without - * the appropriate lock, so we attempt to replace it - * when we have a new candidate. - */ - } - - if (!rxrpc_get_peer_maybe(peer)) - peer = NULL; - } - rcu_read_unlock(); - - if (!peer) { - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) - goto enomem; - } - - /* We don't have a matching record yet. */ - conn = rxrpc_alloc_connection(GFP_NOIO); - if (!conn) - goto enomem_peer; - conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - conn->params.local = local; - conn->params.peer = peer; conn->params.service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; - conn->state = RXRPC_CONN_SERVICE; - if (conn->params.service_id) + if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; - - rxrpc_get_local(local); - - /* We maintain an extra ref on the connection whilst it is on - * the rxrpc_connections list. - */ - atomic_set(&conn->usage, 2); - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); - write_unlock(&rxrpc_connection_lock); + else + conn->state = RXRPC_CONN_SERVICE; /* Make the connection a target for incoming packets. */ - rxrpc_publish_service_conn(peer, conn); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - -found_extant_connection_rcu: - rcu_read_unlock(); - goto success; - -security_mismatch_rcu: - rcu_read_unlock(); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); + rxrpc_publish_service_conn(conn->params.peer, conn); -enomem_peer: - rxrpc_put_peer(peer); -enomem: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5906579060cd..afeba98004b1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1,6 +1,6 @@ /* RxRPC packet reception * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -27,549 +27,547 @@ #include #include "ar-internal.h" +static void rxrpc_proto_abort(const char *why, + struct rxrpc_call *call, rxrpc_seq_t seq) +{ + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) { + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +} + /* - * queue a packet for recvmsg to pass to userspace - * - the caller must hold a lock on call->lock - * - must not be called with interrupts disabled (sk_filter() disables BH's) - * - eats the packet whether successful or not - * - there must be just one reference to the packet, which the caller passes to - * this function + * Apply a hard ACK by advancing the Tx window. */ -int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, - bool force, bool terminal) +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) { - struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx; - struct sock *sk; - int ret; + struct sk_buff *skb, *list = NULL; + int ix; - _enter(",,%d,%d", force, terminal); + spin_lock(&call->lock); - ASSERT(!irqs_disabled()); + while (before(call->tx_hard_ack, to)) { + call->tx_hard_ack++; + ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + skb->next = list; + list = skb; + } - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, call); + spin_unlock(&call->lock); - /* if we've already posted the terminal message for a call, then we - * don't post any more */ - if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - _debug("already terminated"); - ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + while (list) { + skb = list; + list = skb->next; + skb->next = NULL; rxrpc_free_skb(skb); - return 0; } +} - /* The socket may go away under us */ - ret = 0; - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (!rx) - goto out; - sk = &rx->sk; - if (sock_flag(sk, SOCK_DEAD)) - goto out; +/* + * End the transmission phase of a call. + * + * This occurs when we get an ACKALL packet, the first DATA packet of a reply, + * or a final ACK packet. + */ +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) +{ + _enter(""); - if (!force) { - /* cast skb->rcvbuf to unsigned... It's pointless, but - * reduces number of warnings when compiling with -W - * --ANK */ -// ret = -ENOBUFS; -// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -// (unsigned int) sk->sk_rcvbuf) -// goto out; - - ret = sk_filter(sk, skb); - if (ret < 0) - goto out; + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + return true; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; + default: + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; } - spin_lock_bh(&sk->sk_receive_queue.lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && - !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - sk->sk_state != RXRPC_CLOSE) { - skb->destructor = rxrpc_packet_destructor; - skb->dev = NULL; - skb->sk = sk; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - - if (terminal) { - _debug("<<<< TERMINAL MESSAGE >>>>"); - set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); - } + rxrpc_rotate_tx_window(call, call->tx_top); - /* allow interception by a kernel service */ - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL && - rx->notify_new_call) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk, NULL, 0); - } else if (call->notify_rx) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - call->notify_rx(&rx->sk, call, call->user_call_ID); - } else { - _net("post skb %p", skb); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); + write_lock(&call->state_lock); - sk->sk_data_ready(sk); - } - skb = NULL; - } else { - spin_unlock_bh(&sk->sk_receive_queue.lock); + switch (call->state) { + default: + break; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + __rxrpc_call_completed(call); + rxrpc_notify_socket(call); + break; } - ret = 0; -out: - rxrpc_free_skb(skb); - rcu_read_unlock(); + write_unlock(&call->state_lock); + _leave(" = ok"); + return true; +} + +/* + * Scan a jumbo packet to validate its structure and to work out how many + * subpackets it contains. + * + * A jumbo packet is a collection of consecutive packets glued together with + * little headers between that indicate how to change the initial header for + * each subpacket. + * + * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but + * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any + * size. + */ +static bool rxrpc_validate_jumbo(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int len = skb->data_len; + int nr_jumbo = 1; + u8 flags = sp->hdr.flags; + + do { + nr_jumbo++; + if (len - offset < RXRPC_JUMBO_SUBPKTLEN) + goto protocol_error; + if (flags & RXRPC_LAST_PACKET) + goto protocol_error; + offset += RXRPC_JUMBO_DATALEN; + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + goto protocol_error; + offset += sizeof(struct rxrpc_jumbo_header); + } while (flags & RXRPC_JUMBO_PACKET); + + sp->nr_jumbo = nr_jumbo; + return true; - _leave(" = %d", ret); - return ret; +protocol_error: + return false; } /* - * process a DATA packet, posting the packet to the appropriate queue - * - eats the packet if successful + * Handle reception of a duplicate packet. + * + * We have to take care to avoid an attack here whereby we're given a series of + * jumbograms, each with a sequence number one before the preceding one and + * filled up to maximum UDP size. If they never send us the first packet in + * the sequence, they can cause us to have to hold on to around 2MiB of kernel + * space until the call times out. + * + * We limit the space usage by only accepting three duplicate jumbo packets per + * call. After that, we tell the other side we're no longer accepting jumbos + * (that information is encoded in the ACK packet). */ -static int rxrpc_fast_process_data(struct rxrpc_call *call, - struct sk_buff *skb, u32 seq) +static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, + u8 annotation, bool *_jumbo_dup) { - struct rxrpc_skb_priv *sp; - bool terminal; - int ret, ackbit, ack; - u32 serial; - u16 skew; - u8 flags; + /* Discard normal packets that are duplicates. */ + if (annotation == 0) + return; - _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + /* Skip jumbo subpackets that are duplicates. When we've had three or + * more partially duplicate jumbo packets, we refuse to take any more + * jumbos for this call. + */ + if (!*_jumbo_dup) { + call->nr_jumbo_dup++; + *_jumbo_dup = true; + } +} - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, NULL); - flags = sp->hdr.flags; - serial = sp->hdr.serial; - skew = skb->priority; +/* + * Process a DATA packet, adding the packet to the Rx ring. + */ +static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int ix; + rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; + rxrpc_seq_t seq = sp->hdr.seq, hard_ack; + bool immediate_ack = false, jumbo_dup = false, queued; + u16 len; + u8 ack = 0, flags, annotation = 0; - spin_lock(&call->lock); + _enter("{%u,%u},{%u,%u}", + call->rx_hard_ack, call->rx_top, skb->data_len, seq); - if (call->state > RXRPC_CALL_COMPLETE) - goto discard; + _proto("Rx DATA %%%u { #%u f=%02x }", + sp->hdr.serial, seq, sp->hdr.flags); - ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); - ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); - ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + if (call->state >= RXRPC_CALL_COMPLETE) + return; - if (seq < call->rx_data_post) { - _debug("dup #%u [-%u]", seq, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - ret = -ENOBUFS; - goto discard_and_ack; - } + /* Received data implicitly ACKs all of the request packets we sent + * when we're acting as a client. + */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY && + !rxrpc_end_tx_phase(call, "ETD")) + return; - /* we may already have the packet in the out of sequence queue */ - ackbit = seq - (call->rx_data_eaten + 1); - ASSERTCMP(ackbit, >=, 0); - if (__test_and_set_bit(ackbit, call->ackr_window)) { - _debug("dup oos #%u [%u,%u]", - seq, call->rx_data_eaten, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - goto discard_and_ack; - } + call->ackr_prev_seq = seq; - if (seq >= call->ackr_win_top) { - _debug("exceed #%u [%u]", seq, call->ackr_win_top); - __clear_bit(ackbit, call->ackr_window); + hard_ack = READ_ONCE(call->rx_hard_ack); + if (after(seq, hard_ack + call->rx_winsize)) { ack = RXRPC_ACK_EXCEEDS_WINDOW; - goto discard_and_ack; + ack_serial = serial; + goto ack; } - if (seq == call->rx_data_expect) { - clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); - call->rx_data_expect++; - } else if (seq > call->rx_data_expect) { - _debug("oos #%u [%u]", seq, call->rx_data_expect); - call->rx_data_expect = seq + 1; - if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - goto enqueue_and_ack; + flags = sp->hdr.flags; + if (flags & RXRPC_JUMBO_PACKET) { + if (call->nr_jumbo_dup > 3) { + ack = RXRPC_ACK_NOSPACE; + ack_serial = serial; + goto ack; } - goto enqueue_packet; + annotation = 1; } - if (seq != call->rx_data_post) { - _debug("ahead #%u [%u]", seq, call->rx_data_post); - goto enqueue_packet; +next_subpacket: + queued = false; + ix = seq & RXRPC_RXTX_BUFF_MASK; + len = skb->data_len; + if (flags & RXRPC_JUMBO_PACKET) + len = RXRPC_JUMBO_DATALEN; + + if (flags & RXRPC_LAST_PACKET) { + if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) + return rxrpc_proto_abort("LSN", call, seq); + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) + return rxrpc_proto_abort("LSA", call, seq); } - if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) - goto protocol_error; - - /* if the packet need security things doing to it, then it goes down - * the slow path */ - if (call->security_ix) - goto enqueue_packet; - - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - terminal = ((flags & RXRPC_LAST_PACKET) && - !(flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOBUFS) { - __clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_NOSPACE; - goto discard_and_ack; + if (before_eq(seq, hard_ack)) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; + goto skip; + } + + if (flags & RXRPC_REQUEST_ACK && !ack) { + ack = RXRPC_ACK_REQUESTED; + ack_serial = serial; + } + + if (call->rxtx_buffer[ix]) { + rxrpc_input_dup_data(call, seq, annotation, &jumbo_dup); + if (ack != RXRPC_ACK_DUPLICATE) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; } - goto out; + immediate_ack = true; + goto skip; } - skb = NULL; - sp = NULL; - - _debug("post #%u", seq); - ASSERTCMP(call->rx_data_post, ==, seq); - call->rx_data_post++; - - if (flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); - - /* if we've reached an out of sequence packet then we need to drain - * that queue into the socket Rx queue now */ - if (call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). + */ + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) + smp_store_release(&call->rx_top, seq); + queued = true; + + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + ack_serial = serial; + } + call->rx_expect_next = seq + 1; } - spin_unlock(&call->lock); - atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, false); - _leave(" = 0 [posted]"); - return 0; +skip: + offset += len; + if (flags & RXRPC_JUMBO_PACKET) { + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + return rxrpc_proto_abort("XJF", call, seq); + offset += sizeof(struct rxrpc_jumbo_header); + seq++; + serial++; + annotation++; + if (flags & RXRPC_JUMBO_PACKET) + annotation |= RXRPC_RX_ANNO_JLAST; + + _proto("Rx DATA Jumbo %%%u", serial); + goto next_subpacket; + } -protocol_error: - ret = -EBADMSG; -out: - spin_unlock(&call->lock); - _leave(" = %d", ret); - return ret; + if (queued && flags & RXRPC_LAST_PACKET && !ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } -discard_and_ack: - _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, skew, serial, true); -discard: - spin_unlock(&call->lock); - rxrpc_free_skb(skb); - _leave(" = 0 [discarded]"); - return 0; +ack: + if (ack) + rxrpc_propose_ACK(call, ack, skew, ack_serial, + immediate_ack, true); -enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, skew, serial, true); -enqueue_packet: - _net("defer skb %p", skb); - spin_unlock(&call->lock); - skb_queue_tail(&call->rx_queue, skb); - atomic_inc(&call->ackr_not_idle); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - _leave(" = 0 [queued]"); - return 0; + if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) + rxrpc_notify_socket(call); + _leave(" [queued]"); } /* - * assume an implicit ACKALL of the transmission phase of a client socket upon - * reception of the first reply packet + * Process the extra information that may be appended to an ACK packet */ -static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_ackinfo *ackinfo) { - write_lock_bh(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - call->acks_latest = serial; - - _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); - write_unlock_bh(&call->state_lock); + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; + unsigned int mtu; + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + sp->hdr.serial, + ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), + ntohl(ackinfo->rwind), ntohl(ackinfo->jumbo_max)); + + if (call->tx_winsize > ntohl(ackinfo->rwind)) + call->tx_winsize = ntohl(ackinfo->rwind); + + mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + + peer = call->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_RESEND, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); +/* + * Process individual soft ACKs. + * + * Each ACK in the array corresponds to one packet and can be either an ACK or + * a NAK. If we get find an explicitly NAK'd packet we resend immediately; + * packets that lie beyond the end of the ACK list are scheduled for resend by + * the timer on the basis that the peer might just not have processed them at + * the time the ACK was sent. + */ +static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, + rxrpc_seq_t seq, int nr_acks) +{ + bool resend = false; + int ix; + + for (; nr_acks > 0; nr_acks--, seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + switch (*acks) { + case RXRPC_ACK_TYPE_ACK: + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; + break; + case RXRPC_ACK_TYPE_NACK: + if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK) + continue; + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK; + resend = true; + break; + default: + return rxrpc_proto_abort("SFT", call, 0); } - break; - - default: - write_unlock_bh(&call->state_lock); - break; } + + if (resend && + !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); } /* - * post an incoming packet to the nominated call to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an ACK packet. + * + * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet + * in the ACK array. Anything before that is hard-ACK'd and may be discarded. + * + * A hard-ACK means that a packet has been processed and may be discarded; a + * soft-ACK means that the packet may be discarded and retransmission + * requested. A phase is complete when all packets are hard-ACK'd. */ -void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code; - - _enter("%p,%p", call, skb); - - ASSERT(!irqs_disabled()); - -#if 0 // INJECT RX ERROR - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - static int skip = 0; - if (++skip == 3) { - printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); - skip = 0; - goto free_packet; - } + union { + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo info; + u8 acks[RXRPC_MAXACKS]; + } buf; + rxrpc_seq_t first_soft_ack, hard_ack; + int nr_acks, offset; + + _enter(""); + + if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) { + _debug("extraction failure"); + return rxrpc_proto_abort("XAK", call, 0); } -#endif - - /* request ACK generation for any ACK or DATA packet that requests - * it */ - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", sp->hdr.serial); + sp->offset += sizeof(buf.ack); + + first_soft_ack = ntohl(buf.ack.firstPacket); + hard_ack = first_soft_ack - 1; + nr_acks = buf.ack.nAcks; + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + sp->hdr.serial, + ntohs(buf.ack.maxSkew), + first_soft_ack, + ntohl(buf.ack.previousPacket), + ntohl(buf.ack.serial), + rxrpc_acks(buf.ack.reason), + buf.ack.nAcks); + + if (buf.ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + skew, sp->hdr.serial, true, true); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skb->priority, sp->hdr.serial, false); + skew, sp->hdr.serial, true, true); } - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - _debug("abort"); - - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - goto protocol_error; - - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_REMOTELY_ABORTED, - abort_code, ECONNABORTED)) { - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - rxrpc_queue_call(call); - } - goto free_packet; - - case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); - - if (rxrpc_is_service_call(call)) - goto protocol_error; + offset = sp->offset + nr_acks + 3; + if (skb->data_len >= offset + sizeof(buf.info)) { + if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); + rxrpc_input_ackinfo(call, skb, &buf.info); + } - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - __rxrpc_set_call_completion(call, - RXRPC_CALL_SERVER_BUSY, - 0, EBUSY); - set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - rxrpc_queue_call(call); - case RXRPC_CALL_SERVER_BUSY: - goto free_packet_unlock; - default: - goto protocol_error_locked; - } + if (first_soft_ack == 0) + return rxrpc_proto_abort("AK0", call, 0); + /* Ignore ACKs unless we are or have just been transmitting. */ + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_SEND_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); - goto protocol_error; - - case RXRPC_PACKET_TYPE_DATA: - _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - if (sp->hdr.seq == 0) - goto protocol_error; - - call->ackr_prev_seq = sp->hdr.seq; + return; + } - /* received data implicitly ACKs all of the request packets we - * sent when we're acting as a client */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, sp->hdr.serial); + /* Discard any out-of-order or duplicate ACKs. */ + if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + sp->hdr.serial, call->acks_latest); + return; + } + call->acks_latest = sp->hdr.serial; - switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { - case 0: - skb = NULL; - goto done; + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && + hard_ack == call->tx_top) { + rxrpc_end_tx_phase(call, "ETA"); + return; + } - default: - BUG(); + if (before(hard_ack, call->tx_hard_ack) || + after(hard_ack, call->tx_top)) + return rxrpc_proto_abort("AKW", call, 0); - /* data packet received beyond the last packet */ - case -EBADMSG: - goto protocol_error; - } + if (after(hard_ack, call->tx_hard_ack)) + rxrpc_rotate_tx_window(call, hard_ack); - case RXRPC_PACKET_TYPE_ACKALL: - case RXRPC_PACKET_TYPE_ACK: - /* ACK processing is done in process context */ - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - skb_queue_tail(&call->rx_queue, skb); - rxrpc_queue_call(call); - skb = NULL; - } - read_unlock_bh(&call->state_lock); - goto free_packet; - } + if (after(first_soft_ack, call->tx_top)) + return; -protocol_error: - _debug("protocol error"); - write_lock_bh(&call->state_lock); -protocol_error_locked: - if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); -free_packet_unlock: - write_unlock_bh(&call->state_lock); -free_packet: - rxrpc_free_skb(skb); -done: - _leave(""); + if (nr_acks > call->tx_top - first_soft_ack + 1) + nr_acks = first_soft_ack - call->tx_top + 1; + if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); } /* - * split up a jumbo data packet + * Process an ACKALL packet. */ -static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, - struct sk_buff *jumbo) +static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_jumbo_header jhdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *part; - - _enter(",{%u,%u}", jumbo->data_len, jumbo->len); - - sp = rxrpc_skb(jumbo); - - do { - sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; - - /* make a clone to represent the first subpacket in what's left - * of the jumbo packet */ - part = skb_clone(jumbo, GFP_ATOMIC); - if (!part) { - /* simply ditch the tail in the event of ENOMEM */ - pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); - break; - } - rxrpc_new_skb(part); - - pskb_trim(part, RXRPC_JUMBO_DATALEN); - - if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) - goto protocol_error; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) - goto protocol_error; - if (!pskb_pull(jumbo, sizeof(jhdr))) - BUG(); + _proto("Rx ACKALL %%%u", sp->hdr.serial); - sp->hdr.seq += 1; - sp->hdr.serial += 1; - sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = ntohs(jhdr._rsvd); + rxrpc_end_tx_phase(call, "ETL"); +} - _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); +/* + * Process an ABORT packet. + */ +static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code = RX_CALL_DEAD; - rxrpc_fast_process_packet(call, part); - part = NULL; + _enter(""); - } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + if (skb->len >= 4 && + skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0) + abort_code = ntohl(wtmp); - rxrpc_fast_process_packet(call, jumbo); - _leave(""); - return; + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); -protocol_error: - _debug("protocol error"); - rxrpc_free_skb(part); - if (rxrpc_abort_call("PJP", call, sp->hdr.seq, - RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); - rxrpc_free_skb(jumbo); - _leave(""); + if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED)) + rxrpc_notify_socket(call); } /* - * post an incoming packet to the appropriate call/socket to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an incoming call packet. */ -static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct sk_buff *skb) +static void rxrpc_input_call_packet(struct rxrpc_call *call, + struct sk_buff *skb, u16 skew) { - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _enter("%p,%p", call, skb); - sp = rxrpc_skb(skb); - - _debug("extant call [%d]", call->state); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_COMPLETE: - switch (call->completion) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, - &call->events)) { - rxrpc_queue_call(call); - goto free_unlock; - } - default: - goto dead_call; - case RXRPC_CALL_SUCCEEDED: - if (rxrpc_is_service_call(call)) - goto dead_call; - goto resend_final_ack; - } - - case RXRPC_CALL_CLIENT_FINAL_ACK: - goto resend_final_ack; + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + rxrpc_input_data(call, skb, skew); + break; - default: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_input_ack(call, skb, skew); break; - } - read_unlock(&call->state_lock); + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_process_jumbo_packet(call, skb); - else - rxrpc_fast_process_packet(call, skb); + /* Just ignore BUSY packets from the server; the retry and + * lifespan timers will take care of business. BUSY packets + * from the client don't make sense. + */ + break; - goto done; + case RXRPC_PACKET_TYPE_ABORT: + rxrpc_input_abort(call, skb); + break; -resend_final_ack: - _debug("final ack again"); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - goto free_unlock; + case RXRPC_PACKET_TYPE_ACKALL: + rxrpc_input_ackall(call, skb); + break; -dead_call: - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(conn->params.local, skb); - goto unlock; + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); + break; } -free_unlock: - rxrpc_free_skb(skb); -unlock: - read_unlock(&call->state_lock); -done: + _leave(""); } @@ -600,6 +598,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, rxrpc_queue_local(local); } +/* + * put a packet up for transport-level abort + */ +static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); +} + /* * Extract the wire header from a packet and translate the byte order. */ @@ -611,8 +620,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return -EBADMSG; - if (!pskb_pull(skb, sizeof(whdr))) - BUG(); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -626,6 +633,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); + sp->offset = sizeof(whdr); return 0; } @@ -637,19 +645,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * shut down and the local endpoint from going away, thus sk_user_data will not * be cleared until this function returns. */ -void rxrpc_data_ready(struct sock *sk) +void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; + struct rxrpc_channel *chan; + struct rxrpc_call *call; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local = udp_sk->sk_user_data; struct sk_buff *skb; + unsigned int channel; int ret, skew; - _enter("%p", sk); + _enter("%p", udp_sk); ASSERT(!irqs_disabled()); - skb = skb_recv_datagram(sk, 0, 1, &ret); + skb = skb_recv_datagram(udp_sk, 0, 1, &ret); if (!skb) { if (ret == -EAGAIN) return; @@ -695,111 +706,122 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } - if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: rxrpc_post_packet_to_local(local, skb); goto out; - } - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) - goto bad_message; + case RXRPC_PACKET_TYPE_BUSY: + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + goto discard; + + case RXRPC_PACKET_TYPE_DATA: + if (sp->hdr.callNumber == 0) + goto bad_message; + if (sp->hdr.flags & RXRPC_JUMBO_PACKET && + !rxrpc_validate_jumbo(skb)) + goto bad_message; + break; + } rcu_read_lock(); conn = rxrpc_find_connection_rcu(local, skb); - if (!conn) { - skb->priority = 0; - goto cant_route_call; - } + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; - /* Note the serial number skew here */ - skew = (int)sp->hdr.serial - (int)conn->hi_serial; - if (skew >= 0) { - if (skew > 0) - conn->hi_serial = sp->hdr.serial; - skb->priority = 0; - } else { - skew = -skew; - skb->priority = min(skew, 65535); - } + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } + + /* Note the serial number skew here */ + skew = (int)sp->hdr.serial - (int)conn->hi_serial; + if (skew >= 0) { + if (skew > 0) + conn->hi_serial = sp->hdr.serial; + } else { + skew = -skew; + skew = min(skew, 65535); + } - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - goto out_unlock; - } else { /* Call-bound packets are routed by connection channel. */ - unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; - struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call; + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) goto discard_unlock; if (sp->hdr.callNumber == chan->last_call) { - /* For the previous service call, if completed - * successfully, we discard all further packets. + /* For the previous service call, if completed successfully, we + * discard all further packets. */ if (rxrpc_conn_is_service(conn) && (chan->last_type == RXRPC_PACKET_TYPE_ACK || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) goto discard_unlock; - /* But otherwise we need to retransmit the final packet - * from data cached in the connection record. + /* But otherwise we need to retransmit the final packet from + * data cached in the connection record. */ rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; } call = rcu_dereference(chan->call); - if (!call || atomic_read(&call->usage) == 0) - goto cant_route_call; + } else { + skew = 0; + call = NULL; + } - rxrpc_see_call(call); - rxrpc_post_packet_to_call(conn, call, skb); - goto out_unlock; + if (!call || atomic_read(&call->usage) == 0) { + if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + sp->hdr.callNumber == 0 || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + goto bad_message_unlock; + if (sp->hdr.seq != 1) + goto discard_unlock; + call = rxrpc_new_incoming_call(local, conn, skb); + if (!call) { + rcu_read_unlock(); + goto reject_packet; + } } + rxrpc_input_call_packet(call, skb, skew); + goto discard_unlock; + discard_unlock: - rxrpc_free_skb(skb); -out_unlock: rcu_read_unlock(); +discard: + rxrpc_free_skb(skb); out: trace_rxrpc_rx_done(0, 0); return; -cant_route_call: +out_unlock: rcu_read_unlock(); + goto out; - _debug("can't route call"); - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == 1) { - _debug("first packet"); - skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->processor); - _leave(" [incoming]"); - goto out; - } - skb->priority = RX_INVALID_OPERATION; - } else { - skb->priority = RX_CALL_DEAD; - } - - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - _debug("reject type %d",sp->hdr.type); - goto reject_packet; - } else { - rxrpc_free_skb(skb); - } - _leave(" [no call]"); - return; +wrong_security: + rcu_read_unlock(); + trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + skb->priority = RXKADINCONSISTENCY; + goto post_abort; +bad_message_unlock: + rcu_read_unlock(); bad_message: + trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); skb->priority = RX_PROTOCOL_ERROR; +post_abort: + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index a4aba0246731..7d4375e557e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -30,14 +30,18 @@ static int none_secure_packet(struct rxrpc_call *call, return 0; } -static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - rxrpc_seq_t seq, - u16 expected_cksum) +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq, u16 expected_cksum) { return 0; } +static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ +} + static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) @@ -79,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = { .prime_packet_security = none_prime_packet_security, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, + .locate_data = none_locate_data, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index bcc6593b4cdb..cdd58e6e9fbd 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -98,7 +98,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) + if (skb_copy_bits(skb, sp->offset, &v, 1) < 0) return; _proto("Rx VERSION { %02x }", v); if (v == 0) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 610916f4ae34..782b9adf67cb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -77,7 +77,6 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) INIT_WORK(&local->processor, rxrpc_local_processor); INIT_HLIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); - skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); local->client_conns = RB_ROOT; @@ -308,7 +307,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ - rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); @@ -332,11 +330,6 @@ static void rxrpc_local_processor(struct work_struct *work) if (atomic_read(&local->usage) == 0) return rxrpc_local_destroyer(local); - if (!skb_queue_empty(&local->accept_queue)) { - rxrpc_accept_incoming_calls(local); - again = true; - } - if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); again = true; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 39e7cc37c392..fd096f742e4b 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -50,7 +50,7 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned int rxrpc_rx_window_size = 32; +unsigned int rxrpc_rx_window_size = RXRPC_RXTX_BUFF_SIZE - 46; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8756d74fd74b..719a4c23f09d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include "ar-internal.h" @@ -38,20 +40,38 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_seq_t hard_ack, top, seq; + int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; + /* Barrier against rxrpc_input_data(). */ + hard_ack = READ_ONCE(call->rx_hard_ack); + top = smp_load_acquire(&call->rx_top); + pkt->ack.bufferSpace = htons(8); - pkt->ack.maxSkew = htons(0); - pkt->ack.firstPacket = htonl(call->rx_data_eaten + 1); + pkt->ack.maxSkew = htons(call->ackr_skew); + pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(call->ackr_serial); - pkt->ack.reason = RXRPC_ACK_IDLE; - pkt->ack.nAcks = 0; + pkt->ack.reason = call->ackr_reason; + pkt->ack.nAcks = top - hard_ack; + + if (after(top, hard_ack)) { + seq = hard_ack + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + if (call->rxtx_buffer[ix]) + *ackp++ = RXRPC_ACK_TYPE_ACK; + else + *ackp++ = RXRPC_ACK_TYPE_NACK; + seq++; + } while (before_eq(seq, top)); + } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; + jmax = (call->nr_jumbo_dup > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); pkt->ackinfo.rwind = htonl(rxrpc_rx_window_size); @@ -60,11 +80,11 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; - return 3; + return top - hard_ack + 3; } /* - * Send a final ACK or ABORT call packet. + * Send an ACK or ABORT call packet. */ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) { @@ -158,6 +178,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { + switch (pkt->whdr.type) { + case RXRPC_PACKET_TYPE_ACK: + rxrpc_propose_ACK(call, pkt->ack.reason, + ntohs(pkt->ack.maxSkew), + ntohl(pkt->ack.serial), + true, true); + break; + case RXRPC_PACKET_TYPE_ABORT: + break; + } + } + out: rxrpc_put_connection(conn); kfree(pkt); @@ -233,3 +266,77 @@ send_fragmentable: _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); return ret; } + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct rxrpc_local *local) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + _enter("%d", local->debug_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + } + + _leave(""); +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 27b9ecad007e..c8948936c6fc 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -129,15 +129,14 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + rxrpc_new_skb(skb); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - kfree_skb(skb); + rxrpc_free_skb(skb); return; } - rxrpc_new_skb(skb); - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb); if (peer && !rxrpc_get_peer_maybe(peer)) @@ -249,7 +248,6 @@ void rxrpc_peer_error_distributor(struct work_struct *work) container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; enum rxrpc_call_completion compl; - bool queue; int error; _enter(""); @@ -272,15 +270,8 @@ void rxrpc_peer_error_distributor(struct work_struct *work) hlist_del_init(&call->error_link); rxrpc_see_call(call); - queue = false; - write_lock(&call->state_lock); - if (__rxrpc_set_call_completion(call, compl, 0, error)) { - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - queue = true; - } - write_unlock(&call->state_lock); - if (queue) - rxrpc_queue_call(call); + if (rxrpc_set_call_completion(call, compl, 0, error)) + rxrpc_notify_socket(call); } spin_unlock_bh(&peer->lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index aebc73ac16dc..2efe29a4c232 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -198,6 +198,32 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) return peer; } +/* + * Initialise peer record. + */ +static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) +{ + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (peer->srx.transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; +} + /* * Set up a new peer. */ @@ -214,29 +240,39 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, if (peer) { peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); + rxrpc_init_peer(peer, hash_key); + } - rxrpc_assess_MTU_size(peer); - peer->mtu = peer->if_mtu; - - if (srx->transport.family == AF_INET) { - peer->hdrsize = sizeof(struct iphdr); - switch (srx->transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } + _leave(" = %p", peer); + return peer; +} - peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; +/* + * Set up a new incoming peer. The address is prestored in the preallocated + * peer. + */ +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, + struct rxrpc_peer *prealloc) +{ + struct rxrpc_peer *peer; + unsigned long hash_key; + + hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); + prealloc->local = local; + rxrpc_init_peer(prealloc, hash_key); + + spin_lock(&rxrpc_peer_hash_lock); + + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = prealloc; + hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key); } - _leave(" = %p", peer); + spin_unlock(&rxrpc_peer_hash_lock); return peer; } @@ -272,7 +308,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -282,7 +318,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, hash_add_rcu(rxrpc_peer_hash, &candidate->hash_link, hash_key); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); if (peer) kfree(candidate); @@ -307,9 +343,9 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) { ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); hash_del_rcu(&peer->hash_link); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); kfree_rcu(peer, rcu); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6876ffb3b410..20d0b5c6f81b 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -19,319 +19,479 @@ #include "ar-internal.h" /* - * receive a message from an RxRPC socket - * - we need to be careful about two or more threads calling recvmsg - * simultaneously + * Post a call for attention by the socket or kernel service. Further + * notifications are suppressed by putting recvmsg_link on a dummy queue. */ -int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +void rxrpc_notify_socket(struct rxrpc_call *call) { - struct rxrpc_skb_priv *sp; - struct rxrpc_call *call = NULL, *continue_call = NULL; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct sk_buff *skb; - long timeo; - int copy, ret, ullen, offset, copied = 0; - u32 abort_code; + struct rxrpc_sock *rx; + struct sock *sk; - DEFINE_WAIT(wait); + _enter("%d", call->debug_id); - _enter(",,,%zu,%d", len, flags); + if (!list_empty(&call->recvmsg_link)) + return; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + sk = &rx->sk; + if (rx && sk->sk_state < RXRPC_CLOSE) { + if (call->notify_rx) { + call->notify_rx(sk, call, call->user_call_ID); + } else { + write_lock_bh(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + rxrpc_get_call(call, rxrpc_call_got); + list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); + } + write_unlock_bh(&rx->recvmsg_lock); - if (flags & (MSG_OOB | MSG_TRUNC)) - return -EOPNOTSUPP; + if (!sock_flag(sk, SOCK_DEAD)) { + _debug("call %ps", sk->sk_data_ready); + sk->sk_data_ready(sk); + } + } + } - ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + rcu_read_unlock(); + _leave(""); +} - timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); - msg->msg_flags |= MSG_MORE; +/* + * Pass a call terminating message to userspace. + */ +static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) +{ + u32 tmp = 0; + int ret; - lock_sock(&rx->sk); + switch (call->completion) { + case RXRPC_CALL_SUCCEEDED: + ret = 0; + if (rxrpc_is_service_call(call)) + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); + break; + case RXRPC_CALL_REMOTELY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_LOCALLY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_NETWORK_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); + break; + case RXRPC_CALL_LOCAL_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); + break; + default: + pr_err("Invalid terminal call state %u\n", call->state); + BUG(); + break; + } - for (;;) { - /* return immediately if a client socket has no outstanding - * calls */ - if (RB_EMPTY_ROOT(&rx->calls)) { - if (copied) - goto out; - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call, - rxrpc_call_put); - return -ENODATA; - } - } + return ret; +} - /* get the next message on the Rx queue */ - skb = skb_peek(&rx->sk.sk_receive_queue); - if (!skb) { - /* nothing remains on the queue */ - if (copied && - (flags & MSG_PEEK || timeo == 0)) - goto out; +/* + * Pass back notification of a new call. The call is added to the + * to-be-accepted list. This means that the next call to be accepted might not + * be the last call seen awaiting acceptance, but unless we leave this on the + * front of the queue and block all other messages until someone gives us a + * user_ID for it, there's not a lot we can do. + */ +static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, int flags) +{ + int tmp = 0, ret; - /* wait for a message to turn up */ - release_sock(&rx->sk); - prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, - TASK_INTERRUPTIBLE); - ret = sock_error(&rx->sk); - if (ret) - goto wait_error; - - if (skb_queue_empty(&rx->sk.sk_receive_queue)) { - if (signal_pending(current)) - goto wait_interrupted; - timeo = schedule_timeout(timeo); - } - finish_wait(sk_sleep(&rx->sk), &wait); - lock_sock(&rx->sk); - continue; - } + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); - peek_next_packet: - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - call = sp->call; - ASSERT(call != NULL); - rxrpc_see_call(call); - - _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); - - /* make sure we wait for the state to be updated in this call */ - spin_lock_bh(&call->lock); - spin_unlock_bh(&call->lock); - - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("packet from released call"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - continue; - } + if (ret == 0 && !(flags & MSG_PEEK)) { + _debug("to be accepted"); + write_lock_bh(&rx->recvmsg_lock); + list_del_init(&call->recvmsg_link); + write_unlock_bh(&rx->recvmsg_lock); - /* determine whether to continue last data receive */ - if (continue_call) { - _debug("maybe cont"); - if (call != continue_call || - skb->mark != RXRPC_SKB_MARK_DATA) { - release_sock(&rx->sk); - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [noncont]", copied); - return copied; - } - } + write_lock(&rx->call_lock); + list_add_tail(&call->accept_link, &rx->to_be_accepted); + write_unlock(&rx->call_lock); + } - rxrpc_get_call(call, rxrpc_call_got); + return ret; +} - /* copy the peer address and timestamp */ - if (!continue_call) { - if (msg->msg_name) { - size_t len = - sizeof(call->conn->params.peer->srx); - memcpy(msg->msg_name, - &call->conn->params.peer->srx, len); - msg->msg_namelen = len; - } - sock_recv_timestamp(msg, &rx->sk, skb); - } +/* + * End the packet reception phase. + */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call) +{ + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - /* receive the message */ - if (skb->mark != RXRPC_SKB_MARK_DATA) - goto receive_non_data_message; + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } else { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false); + } - _debug("recvmsg DATA #%u { %d, %d }", - sp->hdr.seq, skb->len, sp->offset); + write_lock_bh(&call->state_lock); - if (!continue_call) { - /* only set the control data once per recvmsg() */ - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - } + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + __rxrpc_call_completed(call); + break; - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + break; + default: + break; + } - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + write_unlock_bh(&call->state_lock); +} - offset = sp->offset; - copy = skb->len - offset; - if (copy > len - copied) - copy = len - copied; +/* + * Discard a packet we've used up and advance the Rx window by one. + */ +static void rxrpc_rotate_rx_window(struct rxrpc_call *call) +{ + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top; + int ix; + + _enter("%d", call->debug_id); + + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + ASSERT(before(hard_ack, top)); + + hard_ack++; + ix = hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + /* Barrier against rxrpc_input_data(). */ + smp_store_release(&call->rx_hard_ack, hard_ack); - ret = skb_copy_datagram_msg(skb, offset, msg, copy); + rxrpc_free_skb(skb); + _debug("%u,%u,%lx", hard_ack, top, call->flags); + if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + rxrpc_end_rx_phase(call); +} + +/* + * Decrypt and verify a (sub)packet. The packet's length may be changed due to + * padding, but if this is the case, the packet length will be resident in the + * socket buffer. Note that we can't modify the master skb info as the skb may + * be the home to multiple subpackets. + */ +static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + u8 annotation, + unsigned int offset, unsigned int len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t seq = sp->hdr.seq; + u16 cksum = sp->hdr.cksum; + + _enter(""); + + /* For all but the head jumbo subpacket, the security checksum is in a + * jumbo header immediately prior to the data. + */ + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { + __be16 tmp; + if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) + BUG(); + cksum = ntohs(tmp); + seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; + } + + return call->conn->security->verify_packet(call, skb, offset, len, + seq, cksum); +} + +/* + * Locate the data within a packet. This is complicated by: + * + * (1) An skb may contain a jumbo packet - so we have to find the appropriate + * subpacket. + * + * (2) The (sub)packets may be encrypted and, if so, the encrypted portion + * contains an extra header which includes the true length of the data, + * excluding any encrypted padding. + */ +static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + u8 *_annotation, + unsigned int *_offset, unsigned int *_len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = *_offset; + unsigned int len = *_len; + int ret; + u8 annotation = *_annotation; + + if (offset > 0) + return 0; + + /* Locate the subpacket */ + offset = sp->offset; + len = skb->len - sp->offset; + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { + offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * + RXRPC_JUMBO_SUBPKTLEN); + len = (annotation & RXRPC_RX_ANNO_JLAST) ? + skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; + } + + if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { + ret = rxrpc_verify_packet(call, skb, annotation, offset, len); if (ret < 0) - goto copy_error; + return ret; + *_annotation |= RXRPC_RX_ANNO_VERIFIED; + } - /* handle piecemeal consumption of data packets */ - _debug("copied %d+%d", copy, copied); + *_offset = offset; + *_len = len; + call->conn->security->locate_data(call, skb, _offset, _len); + return 0; +} - offset += copy; - copied += copy; +/* + * Deliver messages to a call. This keeps processing packets until the buffer + * is filled and we find either more DATA (returns 0) or the end of the DATA + * (returns 1). If more packets are required, it returns -EAGAIN. + */ +static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, struct iov_iter *iter, + size_t len, int flags, size_t *_offset) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + size_t remain; + bool last; + unsigned int rx_pkt_offset, rx_pkt_len; + int ix, copy, ret = 0; + + _enter(""); + + rx_pkt_offset = call->rx_pkt_offset; + rx_pkt_len = call->rx_pkt_len; + + /* Barriers against rxrpc_input_data(). */ + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + if (!skb) + break; + smp_rmb(); + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); - if (!(flags & MSG_PEEK)) - sp->offset = offset; + if (msg) + sock_recv_timestamp(msg, sock->sk, skb); + + ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + _debug("recvmsg %x DATA #%u { %d, %d }", + sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); + + /* We have to handle short, empty and used-up DATA packets. */ + remain = len - *_offset; + copy = rx_pkt_len; + if (copy > remain) + copy = remain; + if (copy > 0) { + ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret < 0) + goto out; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d @%zu", copy, *_offset); + + rx_pkt_offset += copy; + rx_pkt_len -= copy; + *_offset += copy; + } - if (sp->offset < skb->len) { + if (rx_pkt_len > 0) { _debug("buffer full"); - ASSERTCMP(copied, ==, len); + ASSERTCMP(*_offset, ==, len); break; } - /* we transferred the whole data packet */ + /* The whole packet has been transferred. */ + last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) - rxrpc_kernel_data_consumed(call, skb); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - _debug("last"); - if (rxrpc_conn_is_client(call->conn)) { - /* last byte of reply received */ - ret = copied; - goto terminal_message; - } + rxrpc_rotate_rx_window(call); + rx_pkt_offset = 0; + rx_pkt_len = 0; - /* last bit of request received */ - if (!(flags & MSG_PEEK)) { - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != - skb) - BUG(); - rxrpc_free_skb(skb); - } - msg->msg_flags &= ~MSG_MORE; - break; - } + ASSERTIFCMP(last, seq, ==, top); + } - /* move on to the next data message */ - _debug("next"); - if (!continue_call) - continue_call = sp->call; - else - rxrpc_put_call(call, rxrpc_call_put); - call = NULL; - - if (flags & MSG_PEEK) { - _debug("peek next"); - skb = skb->next; - if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) - break; - goto peek_next_packet; - } + if (after(seq, top)) { + ret = -EAGAIN; + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + ret = 1; + } +out: + if (!(flags & MSG_PEEK)) { + call->rx_pkt_offset = rx_pkt_offset; + call->rx_pkt_len = rx_pkt_len; + } + _leave(" = %d [%u/%u]", ret, seq, top); + return ret; +} - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); +/* + * Receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct list_head *l; + size_t copied = 0; + long timeo; + int ret; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + +try_again: + lock_sock(&rx->sk); + + /* Return immediately if a client socket has no outstanding calls */ + if (RB_EMPTY_ROOT(&rx->calls) && + list_empty(&rx->recvmsg_q) && + rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + return -ENODATA; } - /* end of non-terminal data packet reception for the moment */ - _debug("end rcv data"); -out: - release_sock(&rx->sk); - if (call) - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [data]", copied); - return copied; - - /* handle non-DATA messages such as aborts, incoming connections and - * final ACKs */ -receive_non_data_message: - _debug("non-data"); - - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { - _debug("RECV NEW CALL"); - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); - if (ret < 0) - goto copy_error; - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); + if (list_empty(&rx->recvmsg_q)) { + ret = -EWOULDBLOCK; + if (timeo == 0) + goto error_no_call; + + release_sock(&rx->sk); + + /* Wait for something to happen */ + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (list_empty(&rx->recvmsg_q)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); } - goto out; + finish_wait(sk_sleep(&rx->sk), &wait); + goto try_again; } - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + /* Find the next call and dequeue it if we're not just peeking. If we + * do dequeue it, that comes with a ref that we will need to release. + */ + write_lock_bh(&rx->recvmsg_lock); + l = rx->recvmsg_q.next; + call = list_entry(l, struct rxrpc_call, recvmsg_link); + if (!(flags & MSG_PEEK)) + list_del_init(&call->recvmsg_link); + else + rxrpc_get_call(call, rxrpc_call_got); + write_unlock_bh(&rx->recvmsg_lock); - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: + _debug("recvmsg call %p", call); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - case RXRPC_SKB_MARK_FINAL_ACK: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); - break; - case RXRPC_SKB_MARK_BUSY: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - if (call->error) { - abort_code = call->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), + &call->user_call_ID); } + if (ret < 0) + goto error; + } + + if (msg->msg_name) { + size_t len = sizeof(call->conn->params.peer->srx); + memcpy(msg->msg_name, &call->conn->params.peer->srx, len); + msg->msg_namelen = len; + } + + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; - case RXRPC_SKB_MARK_NET_ERROR: - _debug("RECV NET ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - _debug("RECV LOCAL ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) + ret = 0; break; default: - pr_err("Unknown packet mark %u\n", skb->mark); - BUG(); + ret = 0; break; } if (ret < 0) - goto copy_error; - -terminal_message: - _debug("terminal"); - msg->msg_flags &= ~MSG_MORE; - msg->msg_flags |= MSG_EOR; + goto error; - if (!(flags & MSG_PEEK)) { - _net("free terminal skb %p", skb); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - rxrpc_release_call(rx, call); + if (call->state == RXRPC_CALL_COMPLETE) { + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; } - release_sock(&rx->sk); - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d", ret); - return ret; + if (ret == 0) + msg->msg_flags |= MSG_MORE; + else + msg->msg_flags &= ~MSG_MORE; + ret = copied; -copy_error: - _debug("copy error"); - release_sock(&rx->sk); +error: rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); +error_no_call: + release_sock(&rx->sk); _leave(" = %d", ret); return ret; @@ -339,85 +499,8 @@ wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - if (copied) - copied = ret; - _leave(" = %d [waitfail %d]", copied, ret); - return copied; - -} - -/* - * Deliver messages to a call. This keeps processing packets until the buffer - * is filled and we find either more DATA (returns 0) or the end of the DATA - * (returns 1). If more packets are required, it returns -EAGAIN. - * - * TODO: Note that this is hacked in at the moment and will be replaced. - */ -static int temp_deliver_data(struct socket *sock, struct rxrpc_call *call, - struct iov_iter *iter, size_t size, - size_t *_offset) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - size_t remain; - int ret, copy; - - _enter("%d", call->debug_id); - -next: - local_bh_disable(); - skb = skb_dequeue(&call->knlrecv_queue); - local_bh_enable(); - if (!skb) { - if (test_bit(RXRPC_CALL_RX_NO_MORE, &call->flags)) - return 1; - _leave(" = -EAGAIN [empty]"); - return -EAGAIN; - } - - sp = rxrpc_skb(skb); - _debug("dequeued %p %u/%zu", skb, sp->offset, size); - - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - remain = size - *_offset; - if (remain > 0) { - copy = skb->len - sp->offset; - if (copy > remain) - copy = remain; - ret = skb_copy_datagram_iter(skb, sp->offset, iter, - copy); - if (ret < 0) - goto requeue_and_leave; - - /* handle piecemeal consumption of data packets */ - sp->offset += copy; - *_offset += copy; - } - - if (sp->offset < skb->len) - goto partially_used_skb; - - /* We consumed the whole packet */ - ASSERTCMP(sp->offset, ==, skb->len); - if (sp->hdr.flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RX_NO_MORE, &call->flags); - rxrpc_kernel_data_consumed(call, skb); - rxrpc_free_skb(skb); - goto next; - - default: - rxrpc_free_skb(skb); - goto next; - } - -partially_used_skb: - ASSERTCMP(*_offset, ==, size); - ret = 0; -requeue_and_leave: - skb_queue_head(&call->knlrecv_queue, skb); + release_sock(&rx->sk); + _leave(" = %d [wait]", ret); return ret; } @@ -453,8 +536,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct kvec iov; int ret; - _enter("{%d,%s},%zu,%d", - call->debug_id, rxrpc_call_states[call->state], size, want_more); + _enter("{%d,%s},%zu/%zu,%d", + call->debug_id, rxrpc_call_states[call->state], + *_offset, size, want_more); ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); @@ -469,7 +553,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = temp_deliver_data(sock, call, &iter, size, _offset); + ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, + _offset); if (ret < 0) goto out; @@ -494,7 +579,6 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, goto call_complete; default: - *_offset = 0; ret = -EINPROGRESS; goto out; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 3777432df10b..ae392558829d 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -317,6 +317,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, * decrypt partial encryption on a packet (level 1 security) */ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { struct rxkad_level1_hdr sechdr; @@ -330,18 +331,20 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, _enter(""); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. + */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0 || nsg > 16) goto nomem; sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, 8); + skb_to_sgvec(skb, sg, offset, 8); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -353,12 +356,12 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -371,18 +374,16 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -395,6 +396,7 @@ nomem: * wholly decrypt a packet (level 2 security) */ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { const struct rxrpc_key_token *token; @@ -409,12 +411,14 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, _enter(",{%d}", skb->len); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. + */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0) goto nomem; @@ -427,7 +431,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, skb->len); + skb_to_sgvec(skb, sg, offset, len); /* decrypt from the session key */ token = call->conn->params.key->payload.data[0]; @@ -435,19 +439,19 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -460,17 +464,16 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -484,6 +487,7 @@ nomem: * jumbo packet). */ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); @@ -521,6 +525,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (cksum != expected_cksum) { rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } @@ -529,14 +534,60 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_SECURITY_PLAIN: return 0; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, seq); + return rxkad_verify_packet_1(call, skb, offset, len, seq); case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, seq); + return rxkad_verify_packet_2(call, skb, offset, len, seq); default: return -ENOANO; } } +/* + * Locate the data contained in a packet that was partially encrypted. + */ +static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level1_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in a packet that was completely encrypted. + */ +static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level2_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in an already decrypted packet. + */ +static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + switch (call->conn->params.security_level) { + case RXRPC_SECURITY_AUTH: + rxkad_locate_data_1(call, skb, _offset, _len); + return; + case RXRPC_SECURITY_ENCRYPT: + rxkad_locate_data_2(call, skb, _offset, _len); + return; + default: + return; + } +} + /* * issue a challenge */ @@ -704,7 +755,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_challenge challenge; struct rxkad_response resp __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 version, nonce, min_level, abort_code; int ret; @@ -722,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, } abort_code = RXKADPACKETSHORT; - sp = rxrpc_skb(skb); - if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0) goto protocol_error; version = ntohl(challenge.version); @@ -969,7 +1019,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, { struct rxkad_response response __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; time_t expiry; void *ticket; @@ -980,7 +1030,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0) goto protocol_error; if (!pskb_pull(skb, sizeof(response))) BUG(); @@ -988,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, version = ntohl(response.version); ticket_len = ntohl(response.ticket_len); kvno = ntohl(response.kvno); - sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); @@ -1010,7 +1059,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return -ENOMEM; abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0) goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, @@ -1135,6 +1184,7 @@ const struct rxrpc_security rxkad = { .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, + .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 5d79d5a9c944..82d8134e9287 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) } /* find the service */ - read_lock_bh(&local->services_lock); + read_lock(&local->services_lock); hlist_for_each_entry(rx, &local->services, listen_link) { if (rx->srx.srx_service == conn->params.service_id) goto found_service; } /* the service appears to have died */ - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOENT"); return -ENOENT; found_service: if (!rx->securities) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOKEY"); return -ENOKEY; } @@ -152,13 +152,13 @@ found_service: kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc); if (IS_ERR(kref)) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = %ld [search]", PTR_ERR(kref)); return PTR_ERR(kref); } key = key_ref_to_ptr(kref); - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); conn->server_key = key; conn->security = sec; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9a4af992fcdf..cba236575073 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include "ar-internal.h" @@ -38,19 +37,20 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz), - *timeo); + _enter(",{%u,%u,%u}", + call->tx_hard_ack, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 0) + if (call->tx_top - call->tx_hard_ack < call->tx_winsize) break; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -call->error; + break; + } if (signal_pending(current)) { ret = sock_intr_errno(*timeo); break; @@ -68,36 +68,44 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } /* - * attempt to schedule an instant Tx resend + * Schedule an instant Tx resend. */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call) +static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) { - read_lock_bh(&call->state_lock); - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + spin_lock_bh(&call->lock); + + if (call->state < RXRPC_CALL_COMPLETE) { + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } - read_unlock_bh(&call->state_lock); + + spin_unlock_bh(&call->lock); } /* - * queue a packet for transmission, set the resend timer and attempt - * to send the packet immediately + * Queue a DATA packet for transmission, set the resend timeout and send the + * packet immediately */ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, bool last) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int ret; + rxrpc_seq_t seq = sp->hdr.seq; + int ret, ix; + + _net("queue skb %p [%d]", skb, seq); - _net("queue skb %p [%d]", skb, call->acks_head); + ASSERTCMP(seq, ==, call->tx_top + 1); - ASSERT(call->acks_window != NULL); - call->acks_window[call->acks_head] = (unsigned long) skb; + ix = seq & RXRPC_RXTX_BUFF_MASK; + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); - call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + call->rxtx_buffer[ix] = skb; + call->tx_top = seq; + if (last) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); @@ -121,34 +129,17 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout; - if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { - _debug("run timer"); - call->resend_timer.expires = sp->resend_at; - add_timer(&call->resend_timer); - } - - /* attempt to cancel the rx-ACK timer, deferring reply transmission if - * we're ACK'ing the request phase of an incoming call */ - ret = -EAGAIN; - if (try_to_del_timer_sync(&call->ack_timer) >= 0) { - /* the packet may be freed by rxrpc_process_call() before this - * returns */ - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call->conn, skb); - _net("sent skb %p", skb); - } else { - _debug("failed to delete ACK timer"); - } + if (seq == 1 && rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = jiffies + rxrpc_resend_timeout; + ret = rxrpc_send_data_packet(call->conn, skb); if (ret < 0) { _debug("need instant resend %d", ret); - sp->need_resend = true; - rxrpc_instant_resend(call); + rxrpc_instant_resend(call, ix); } + rxrpc_free_skb(skb); _leave(""); } @@ -212,9 +203,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) <= 0) { + if (call->tx_top - call->tx_hard_ack >= + call->tx_winsize) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; @@ -313,7 +303,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, memset(skb_put(skb, pad), 0, pad); } - seq = atomic_inc_return(&call->sequence); + seq = call->tx_top + 1; sp->hdr.epoch = conn->proto.epoch; sp->hdr.cid = call->cid; @@ -329,9 +319,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 1) + else if (call->tx_top - call->tx_hard_ack < + call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; if (more && seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; @@ -358,7 +347,7 @@ out: call_terminated: rxrpc_free_skb(skb); _leave(" = %d", -call->error); - return ret; + return -call->error; maybe_error: if (copied) @@ -451,29 +440,6 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, return 0; } -/* - * abort a call, sending an ABORT packet to the peer - */ -static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, - u32 abort_code, int error) -{ - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - write_lock_bh(&call->state_lock); - - if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - rxrpc_queue_call(call); - } - - write_unlock_bh(&call->state_lock); -} - /* * Create a new client call for sendmsg(). */ @@ -549,7 +515,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); } - rxrpc_see_call(call); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -557,8 +522,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; + if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) + ret = rxrpc_send_call_packet(call, + RXRPC_PACKET_TYPE_ABORT); } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else if (rxrpc_is_client_call(call) && @@ -639,7 +606,8 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, lock_sock(sock->sk); - rxrpc_send_abort(call, why, abort_code, error); + if (rxrpc_abort_call(why, call, 0, abort_code, error)) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); release_sock(sock->sk); _leave(""); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 9b8f8456d3bf..620d9ccaf3c1 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,133 +18,6 @@ #include #include "ar-internal.h" -/* - * set up for the ACK at the end of the receive phase when we discard the final - * receive phase data packet - * - called with softirqs disabled - */ -static void rxrpc_request_final_ACK(struct rxrpc_call *call) -{ - /* the call may be aborted before we have a chance to ACK it */ - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - call->state = RXRPC_CALL_CLIENT_FINAL_ACK; - _debug("request final ACK"); - - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - if (try_to_del_timer_sync(&call->ack_timer) >= 0) - rxrpc_queue_call(call); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - default: - break; - } - - write_unlock(&call->state_lock); -} - -/* - * drop the bottom ACK off of the call ACK window and advance the window - */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int loop; - u32 seq; - - spin_lock_bh(&call->lock); - - _debug("hard ACK #%u", sp->hdr.seq); - - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - call->ackr_window[loop] >>= 1; - call->ackr_window[loop] |= - call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); - } - - seq = sp->hdr.seq; - ASSERTCMP(seq, ==, call->rx_data_eaten + 1); - call->rx_data_eaten = seq; - - if (call->ackr_win_top < UINT_MAX) - call->ackr_win_top++; - - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_post, >=, call->rx_data_recv); - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_recv, >=, call->rx_data_eaten); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - rxrpc_request_final_ACK(call); - } else if (atomic_dec_and_test(&call->ackr_not_idle) && - test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { - /* We previously soft-ACK'd some received packets that have now - * been consumed, so send a hard-ACK if no more packets are - * immediately forthcoming to allow the transmitter to free up - * its Tx bufferage. - */ - _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, - skb->priority, sp->hdr.serial, false); - } - - spin_unlock_bh(&call->lock); -} - -/** - * rxrpc_kernel_data_consumed - Record consumption of data message - * @call: The call to which the message pertains. - * @skb: Message holding data - * - * Record the consumption of a data message and generate an ACK if appropriate. - * The call state is shifted if this was the final packet. The caller must be - * in process context with no spinlocks held. - * - * TODO: Actually generate the ACK here rather than punting this to the - * workqueue. - */ -void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); - - ASSERTCMP(sp->call, ==, call); - ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); - - /* TODO: Fix the sequence number tracking */ - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - - call->rx_data_recv = sp->hdr.seq; - rxrpc_hard_ACK_data(call, skb); -} - -/* - * Destroy a packet that has an RxRPC control buffer - */ -void rxrpc_packet_destructor(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - _enter("%p{%p}", skb, call); - - if (call) { - rxrpc_put_call_for_skb(call, skb); - sp->call = NULL; - } - - if (skb->sk) - sock_rfree(skb); - _leave(""); -} - /* * Note the existence of a new-to-us socket buffer (allocated or dequeued). */ -- cgit v1.2.3 From d545caca827b65aab557a9e9dcdcf1e5a3823c2d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 4 ++-- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 49 ++++++++++++++++++++++++++++-------------- net/ipv4/udp_diag.c | 10 +++++---- net/sctp/sctp_diag.c | 20 +++++++++++------ 5 files changed, 56 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index feb04ea20f11..65da430e260f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns); + struct user_namespace *user_ns, bool net_admin); extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5581206a08ae..b5c366f87b3e 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -123,6 +123,7 @@ enum { INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, + INET_DIAG_MARK, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index abfbe492ebfe..e4d16fc5bbb3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void) int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns) + struct user_namespace *user_ns, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); @@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; - r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -948,7 +965,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 58b79c0c0d69..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index f3508aa75815..807158e32f5f 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; @@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r->idiag_retrans = 0; } - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { @@ -203,6 +204,7 @@ struct sctp_comm_param { struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; + bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) @@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) @@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); release_sock(sk); if (err < 0) { WARN_ON(err == -EMSGSIZE); @@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh) < 0) { + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { cb->args[3] = 1; err = 2; goto release; @@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { err = 2; goto release; } @@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh) < 0) { + cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } @@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, .skb = in_skb, .r = req, .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { @@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .skb = skb, .cb = cb, .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; /* eps hashtable dumps -- cgit v1.2.3 From 8c146bb9d59aa2ac45222171916ece186c4b3943 Mon Sep 17 00:00:00 2001 From: Thomas F Herbert Date: Wed, 7 Sep 2016 12:56:57 -0400 Subject: openvswitch: 802.1ad uapi changes. openvswitch: Add support for 8021.AD Change the description of the VLAN tpid field. Signed-off-by: Thomas F Herbert Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 54c3b4f4aceb..59ed3992c760 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -605,13 +605,13 @@ struct ovs_action_push_mpls { * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set * (but it will not be set in the 802.1Q header that is pushed). * - * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID - * values are those that the kernel module also parses as 802.1Q headers, to - * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN - * from having surprising results. + * The @vlan_tpid value is typically %ETH_P_8021Q or %ETH_P_8021AD. + * The only acceptable TPID values are those that the kernel module also parses + * as 802.1Q or 802.1AD headers, to prevent %OVS_ACTION_ATTR_PUSH_VLAN followed + * by %OVS_ACTION_ATTR_POP_VLAN from having surprising results. */ struct ovs_action_push_vlan { - __be16 vlan_tpid; /* 802.1Q TPID. */ + __be16 vlan_tpid; /* 802.1Q or 802.1ad TPID. */ __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; @@ -721,9 +721,10 @@ enum ovs_nat_attr { * is copied from the value to the packet header field, rest of the bits are * left unchanged. The non-masked value bits must be passed in as zeroes. * Masking is not supported for the %OVS_KEY_ATTR_TUNNEL attribute. - * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the - * packet. - * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. + * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q or 802.1ad header + * onto the packet. + * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q or 802.1ad header + * from the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the -- cgit v1.2.3 From fe19c4f971a55cea3be442d8032a5f6021702791 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 7 Sep 2016 12:56:58 -0400 Subject: vlan: Check for vlan ethernet types for 8021.q or 802.1ad This is to simplify using double tagged vlans. This function allows all valid vlan ethertypes to be checked in a single function call. Also replace some instances that check for both ETH_P_8021Q and ETH_P_8021AD. Patch based on one originally by Thomas F Herbert. Signed-off-by: Thomas F Herbert Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 49d4aef1f789..3319d97d789d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -272,6 +272,23 @@ static inline int vlan_get_encap_level(struct net_device *dev) } #endif +/** + * eth_type_vlan - check for valid vlan ether type. + * @ethertype: ether type to check + * + * Returns true if the ether type is a vlan ether type. + */ +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -425,8 +442,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; - if (veth->h_vlan_proto != htons(ETH_P_8021Q) && - veth->h_vlan_proto != htons(ETH_P_8021AD)) + if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; *vlan_tci = ntohs(veth->h_vlan_TCI); @@ -488,7 +504,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; @@ -506,8 +522,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); + } while (eth_type_vlan(type)); } if (depth) @@ -572,8 +587,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && - likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + likely(!eth_type_vlan(skb->protocol))) return false; return true; @@ -593,15 +607,14 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; - if (likely(protocol != htons(ETH_P_8021Q) && - protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(protocol))) return false; veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) + if (!eth_type_vlan(protocol)) return false; return true; -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 +++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 330 +++++++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 218 insertions(+), 149 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. + */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. + */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.2.3 From 34a3d4b2d1f1b7c81af79f6f93a6cef4c3a0f54a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 8 Sep 2016 13:55:56 -0400 Subject: xfrm: fix header file comment reference to struct xfrm_replay_state_esn Reported-by: Paul Wouters Signed-off-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 143338978b48..1fc62b239f1b 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -298,7 +298,7 @@ enum xfrm_attr_type_t { XFRMA_ALG_AUTH_TRUNC, /* struct xfrm_algo_auth */ XFRMA_MARK, /* struct xfrm_mark */ XFRMA_TFCPAD, /* __u32 */ - XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_esn */ + XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_state_esn */ XFRMA_SA_EXTRA_FLAGS, /* __u32 */ XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ -- cgit v1.2.3 From 3e9b3112ec74f192eaab976c3889e34255cae940 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2016 12:46:44 +0100 Subject: add basic register-field manipulation macros Common approach to accessing register fields is to define structures or sets of macros containing mask and shift pair. Operations on the register are then performed as follows: field = (reg >> shift) & mask; reg &= ~(mask << shift); reg |= (field & mask) << shift; Defining shift and mask separately is tedious. Ivo van Doorn came up with an idea of computing them at compilation time based on a single shifted mask (later refined by Felix) which can be used like this: #define REG_FIELD 0x000ff000 field = FIELD_GET(REG_FIELD, reg); reg &= ~REG_FIELD; reg |= FIELD_PREP(REG_FIELD, field); FIELD_{GET,PREP} macros take care of finding out what the appropriate shift is based on compilation time ffs operation. GENMASK can be used to define registers (which is usually less error-prone and easier to match with datasheets). This approach is the most convenient I've seen so to limit code multiplication let's move the macros to a global header file. Attempts to use static inlines instead of macros failed due to false positive triggering of BUILD_BUG_ON()s, especially with GCC < 6.0. Signed-off-by: Jakub Kicinski Reviewed-by: Dinan Gunawardena Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/bug.h | 3 ++ 2 files changed, 96 insertions(+) create mode 100644 include/linux/bitfield.h (limited to 'include') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h new file mode 100644 index 000000000000..f6505d83069d --- /dev/null +++ b/include/linux/bitfield.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2014 Felix Fietkau + * Copyright (C) 2004 - 2009 Ivo van Doorn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BITFIELD_H +#define _LINUX_BITFIELD_H + +#include + +/* + * Bitfield access macros + * + * FIELD_{GET,PREP} macros take as first parameter shifted mask + * from which they extract the base mask and shift amount. + * Mask must be a compilation time constant. + * + * Example: + * + * #define REG_FIELD_A GENMASK(6, 0) + * #define REG_FIELD_B BIT(7) + * #define REG_FIELD_C GENMASK(15, 8) + * #define REG_FIELD_D GENMASK(31, 16) + * + * Get: + * a = FIELD_GET(REG_FIELD_A, reg); + * b = FIELD_GET(REG_FIELD_B, reg); + * + * Set: + * reg = FIELD_PREP(REG_FIELD_A, 1) | + * FIELD_PREP(REG_FIELD_B, 0) | + * FIELD_PREP(REG_FIELD_C, c) | + * FIELD_PREP(REG_FIELD_D, 0x40); + * + * Modify: + * reg &= ~REG_FIELD_C; + * reg |= FIELD_PREP(REG_FIELD_C, c); + */ + +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ + ({ \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ + _pfx "mask is not constant"); \ + BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ + ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + _pfx "value too large for the field"); \ + BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull, \ + _pfx "type of reg too small for mask"); \ + __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ + (1ULL << __bf_shf(_mask))); \ + }) + +/** + * FIELD_PREP() - prepare a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP() masks and shifts up the value. The result should + * be combined with other fields of the bitfield using logical OR. + */ +#define FIELD_PREP(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +/** + * FIELD_GET() - extract a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg: 32bit value of entire bitfield + * + * FIELD_GET() extracts the field specified by @_mask from the + * bitfield passed in as @_reg by masking and shifting it down. + */ +#define FIELD_GET(_mask, _reg) \ + ({ \ + __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) + +#endif diff --git a/include/linux/bug.h b/include/linux/bug.h index e51b0709e78d..292d6a10b0c2 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -13,6 +13,7 @@ enum bug_trap_type { struct pt_regs; #ifdef __CHECKER__ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) @@ -24,6 +25,8 @@ struct pt_regs; #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) -- cgit v1.2.3 From 634faf3686900ccdee87b77e2c56df8b2159912b Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Mon, 5 Sep 2016 11:42:12 +0100 Subject: brcmfmac: add support for bcm4339 chip with modalias sdio:c00v02D0d4339 The driver already supports the bcm4339 chipset but only for the variant that shares the same modalias as the bcm4335, ie. sdio:c00v02D0d4335. It turns out that there are also bcm4339 devices out there that have a more distiguishable modalias sdio:c00v02D0d4339. Reported-by: Steve deRosier Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 3 ++- include/linux/mmc/sdio_ids.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index f549c25608d6..03404cbe9237 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -1101,6 +1101,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43341), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43362), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354), diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 68ab3ac15650..589a49cd9cd5 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -3757,7 +3757,8 @@ static u32 brcmf_sdio_buscore_read32(void *ctx, u32 addr) u32 val, rev; val = brcmf_sdiod_regrl(sdiodev, addr, NULL); - if (sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4335_4339 && + if ((sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4335_4339 || + sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4339) && addr == CORE_CC_REG(SI_ENUM_BASE, chipid)) { rev = (val & CID_REV_MASK) >> CID_REV_SHIFT; if (rev >= 2) { diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0d126aeb3ec0..d43ef96bf075 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -32,6 +32,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c #define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 +#define SDIO_DEVICE_ID_BROADCOM_4339 0x4339 #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 -- cgit v1.2.3 From 3c15b8e112d7351b2586c649a759318548523364 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 6 Sep 2016 22:35:47 +0800 Subject: netfilter: nf_conntrack: remove unused ctl_table_path member in nf_conntrack_l3proto After commit adf0516845bc ("netfilter: remove ip_conntrack* sysctl compat code"), ctl_table_path member in struct nf_conntrack_l3proto{} is not used anymore, remove it. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index cdc920b4c4c2..8992e4229da9 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -63,10 +63,6 @@ struct nf_conntrack_l3proto { size_t nla_size; -#ifdef CONFIG_SYSCTL - const char *ctl_table_path; -#endif /* CONFIG_SYSCTL */ - /* Init l3proto pernet data */ int (*init_net)(struct net *net); -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ net/core/filter.c | 15 +++++++-------- 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index a16439b99fd9..7fabad8dc3fc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -314,6 +314,20 @@ struct bpf_prog_aux; bpf_size; \ }) +#define BPF_SIZEOF(type) \ + ({ \ + const int __size = bytes_to_bpf_size(sizeof(type)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + +#define BPF_FIELD_SIZEOF(type, field) \ + ({ \ + const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d3869b03d9fe..e63d7d435796 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,18 +583,18 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), - dst_reg, src_reg, + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, offsetof(struct perf_sample_data, period)); break; default: - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), - dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), - dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index 628ed8c7d38d..120c813ef030 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -233,9 +233,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -2685,7 +2684,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2750,7 +2749,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2759,8 +2758,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2795,12 +2794,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion like we do with SYSCALL_DEFINE() macros that are used today. Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, eases to write future helpers, avoids getting the casting mess wrong, and allows for extending all helpers at once (f.e. build time checks, etc). It also helps detecting more easily in code reviews that unused registers are not instrumented in the code by accident, breaking compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as an argument. I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they look all as expected. No sparse issue spotted. We let this also sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 50 ++++++++++++++++++ kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 46 +++++------------ kernel/bpf/stackmap.c | 5 +- kernel/trace/bpf_trace.c | 75 +++++++++++++-------------- net/core/filter.c | 129 ++++++++++++++++++----------------------------- 6 files changed, 149 insertions(+), 158 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fabad8dc3fc..1f09c521adfe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -328,6 +328,56 @@ struct bpf_prog_aux; __size; \ }) +#define __BPF_MAP_0(m, v, ...) v +#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) + +#define __BPF_REG_0(...) __BPF_PAD(5) +#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) + +#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) + +#define __BPF_CAST(t, a) \ + (__force t) \ + (__force \ + typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ + (unsigned long)0, (t)0))) a +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a) t a +#define __BPF_DECL_REGS(t, a) u64 a + +#define __BPF_PAD(n) \ + __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ + u64, __ur_3, u64, __ur_4, u64, __ur_5) + +#define BPF_CALL_x(x, name, ...) \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + { \ + return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + } \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..7b7baaed9ed4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6df73bd1ba34..a5b8bf8cfcfd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,24 +27,10 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. */ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { @@ -54,15 +41,11 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { @@ -75,13 +58,9 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } @@ -99,7 +78,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +89,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,7 +101,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -138,7 +117,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; @@ -158,10 +137,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e63d7d435796..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -61,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -83,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -130,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -178,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -209,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -233,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -312,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -367,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -378,16 +370,13 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { - struct bpf_map *map = (struct bpf_map *)(long)r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - u32 idx = (u32)r2; if (unlikely(in_interrupt())) return -EINVAL; - if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -481,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -504,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. + */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { diff --git a/net/core/filter.c b/net/core/filter.c index d6d9bb89ce3a..298b146b47e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -1376,12 +1374,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1416,12 +1411,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1449,10 +1441,9 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1494,12 +1485,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1547,12 +1537,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1610,9 +1599,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1648,7 +1636,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1687,9 +1675,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1699,9 +1687,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1711,14 +1699,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1728,10 +1716,9 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1756,9 +1743,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1933,10 +1919,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1973,11 +1958,8 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) @@ -2028,12 +2010,11 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); - u32 new_len = (u32) r2; int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) @@ -2113,13 +2094,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2146,10 +2124,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2214,10 +2191,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2252,10 +2227,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2273,7 +2247,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2323,10 +2297,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2372,23 +2345,20 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; sk = skb->sk; if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; @@ -2411,13 +2381,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, return 0; } -static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; -- cgit v1.2.3 From d817f432c2ab7639a4f69de73eafdc55e57c45ad Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:45 +0300 Subject: net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id() Add utility functions to convert a 32 bits key into a 64 bits tunnel and vice versa. These functions will be used instead of cloning code in GRE and VXLAN, and in tc act_iptunnel which will be introduced in a following patch in this patchset. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jiri Benc Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++-- include/net/ip_tunnels.h | 19 +++++++++++++++++++ include/net/vxlan.h | 18 ------------------ net/ipv4/ip_gre.c | 23 ++--------------------- 4 files changed, 23 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 199dec033cf8..4bfeb9765c55 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1291,7 +1291,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) struct metadata_dst *tun_dst; tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, - vxlan_vni_to_tun_id(vni), sizeof(*md)); + key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) goto drop; @@ -1945,7 +1945,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - vni = vxlan_tun_id_to_vni(info->key.tun_id); + vni = tunnel_id_to_key32(info->key.tun_id); remote_ip.sa.sa_family = ip_tunnel_info_af(info); if (remote_ip.sa.sa_family == AF_INET) { remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a5e7035fb93f..e598c639aa6f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -222,6 +222,25 @@ static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } +static inline __be64 key32_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)key; +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static inline __be32 tunnel_id_to_key32(__be64 tun_id) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)tun_id; +#else + return (__force __be32)((__force u64)tun_id >> 32); +#endif +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b96d0360c095..0255613a54a4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -350,24 +350,6 @@ static inline __be32 vxlan_vni_field(__be32 vni) #endif } -static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)tun_id; -#else - return (__force __be32)((__force u64)tun_id >> 32); -#endif -} - -static inline __be64 vxlan_vni_to_tun_id(__be32 vni) -{ -#if defined(__BIG_ENDIAN) - return (__force __be64)vni; -#else - return (__force __be64)((u64)(__force u32)vni << 32); -#endif -} - static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 113cc43df789..576f705d8180 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } -static __be64 key_to_tunnel_id(__be32 key) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u32)key); -#else - return (__force __be64)((__force u64)key << 32); -#endif -} - -/* Returns the least-significant 32 bits of a __be64. */ -static __be32 tunnel_id_to_key(__be64 x) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, __be64 tun_id; flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); - tun_id = key_to_tunnel_id(tpi->key); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; @@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; -- cgit v1.2.3 From 2ff378b7474feac1ec665d01e4dfc6907cccc11c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:46 +0300 Subject: net/dst: Utility functions to build dst_metadata without supplying an skb Extract __ip_tun_set_dst() and __ipv6_tun_set_dst() out of ip_tun_rx_dst() and ipv6_tun_rx_dst(), to be used without supplying an skb. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Reviewed-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 52 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 5db9f5910428..6965c8f68ade 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -112,12 +112,13 @@ static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb return &dst->u.tun_info; } -static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, - __be64 tunnel_id, - int md_size) +static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, + __be32 daddr, + __u8 tos, __u8 ttl, + __be16 flags, + __be64 tunnel_id, + int md_size) { - const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); @@ -125,17 +126,30 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, - iph->saddr, iph->daddr, iph->tos, iph->ttl, + saddr, daddr, tos, ttl, 0, 0, 0, tunnel_id, flags); return tun_dst; } -static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, +static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id, int md_size) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + + return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + flags, tunnel_id, md_size); +} + +static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 tos, __u8 ttl, + __be32 label, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ struct metadata_dst *tun_dst; struct ip_tunnel_info *info; @@ -150,14 +164,26 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.tp_src = 0; info->key.tp_dst = 0; - info->key.u.ipv6.src = ip6h->saddr; - info->key.u.ipv6.dst = ip6h->daddr; + info->key.u.ipv6.src = *saddr; + info->key.u.ipv6.dst = *daddr; - info->key.tos = ipv6_get_dsfield(ip6h); - info->key.ttl = ip6h->hop_limit; - info->key.label = ip6_flowlabel(ip6h); + info->key.tos = tos; + info->key.ttl = ttl; + info->key.label = label; return tun_dst; } +static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, + ipv6_get_dsfield(ip6h), ip6h->hop_limit, + ip6_flowlabel(ip6h), flags, tunnel_id, + md_size); +} #endif /* __NET_DST_METADATA_H */ -- cgit v1.2.3 From bc3103f1ed405de587fa43d8b0671e615505a700 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:47 +0300 Subject: net/sched: cls_flower: Classify packet in ip tunnels Introduce classifying by metadata extracted by the tunnel device. Outer header fields - source/dest ip and tunnel id, are extracted from the metadata when classifying. For example, the following will add a filter on the ingress Qdisc of shared vxlan device named 'vxlan0'. To forward packets with outer src ip 11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be forwarded to tap device 'vnet0' (after metadata is released): $ tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 11.11.0.2 \ enc_dst_ip 11.11.0.1 \ enc_key_id 11 \ dst_ip 11.11.11.1 \ action tunnel_key release \ action mirred egress redirect dev vnet0 The action tunnel_key, will be introduced in the next patch in this series. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 11 +++++ net/sched/cls_flower.c | 100 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 51b5b247fb5a..f9c287c67eae 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -431,6 +431,17 @@ enum { TCA_FLOWER_KEY_VLAN_ID, TCA_FLOWER_KEY_VLAN_PRIO, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index cf9ad5b50889..b084b2aab2d7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -23,9 +23,13 @@ #include #include +#include +#include + struct fl_flow_key { int indev_ifindex; struct flow_dissector_key_control control; + struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; @@ -35,6 +39,11 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_keyid enc_key_id; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -124,11 +133,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); + + info = skb_tunnel_info(skb); + if (info) { + struct ip_tunnel_key *key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_key.enc_ipv4.src = key->u.ipv4.src; + skb_key.enc_ipv4.dst = key->u.ipv4.dst; + break; + case AF_INET6: + skb_key.enc_ipv6.src = key->u.ipv6.src; + skb_key.enc_ipv6.dst = key->u.ipv6.dst; + break; + } + + skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); + } + skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. @@ -297,7 +326,15 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, - + [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, }; static void fl_set_key_val(struct nlattr **tb, @@ -409,6 +446,40 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->tp.dst)); } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, + &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)); + fl_set_key_val(tb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, + &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)); + } + + if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, + &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)); + fl_set_key_val(tb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)); + } + + fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id.keyid)); + return 0; } @@ -821,6 +892,33 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.dst)))) goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)) || + fl_dump_key_val(skb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)))) + goto nla_put_failure; + else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)) || + fl_dump_key_val(skb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)))) + goto nla_put_failure; + + if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id))) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) -- cgit v1.2.3 From d0f6dd8a914f42c6f1a3a8c08caa16559d3d9a1b Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:48 +0300 Subject: net/sched: Introduce act_tunnel_key This action could be used before redirecting packets to a shared tunnel device, or when redirecting packets arriving from a such a device. The action will release the metadata created by the tunnel device (decap), or set the metadata with the specified values for encap operation. For example, the following flower filter will forward all ICMP packets destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before redirecting, a metadata for the vxlan tunnel is created using the tunnel_key action and it's arguments: $ tc filter add dev net0 protocol ip parent ffff: \ flower \ ip_proto 1 \ dst_ip 11.11.11.2 \ action tunnel_key set \ src_ip 11.11.0.1 \ dst_ip 11.11.0.2 \ id 11 \ action mirred egress redirect dev vxlan0 Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 30 +++ include/uapi/linux/tc_act/tc_tunnel_key.h | 41 ++++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/act_tunnel_key.c | 351 ++++++++++++++++++++++++++++++ 5 files changed, 434 insertions(+) create mode 100644 include/net/tc_act/tc_tunnel_key.h create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h create mode 100644 net/sched/act_tunnel_key.c (limited to 'include') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h new file mode 100644 index 000000000000..253f8da6c2a6 --- /dev/null +++ b/include/net/tc_act/tc_tunnel_key.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NET_TC_TUNNEL_KEY_H +#define __NET_TC_TUNNEL_KEY_H + +#include + +struct tcf_tunnel_key_params { + struct rcu_head rcu; + int tcft_action; + int action; + struct metadata_dst *tcft_enc_metadata; +}; + +struct tcf_tunnel_key { + struct tc_action common; + struct tcf_tunnel_key_params __rcu *params; +}; + +#define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) + +#endif /* __NET_TC_TUNNEL_KEY_H */ diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h new file mode 100644 index 000000000000..890106ff16e6 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_TC_TUNNEL_KEY_H +#define __LINUX_TC_TUNNEL_KEY_H + +#include + +#define TCA_ACT_TUNNEL_KEY 17 + +#define TCA_TUNNEL_KEY_ACT_SET 1 +#define TCA_TUNNEL_KEY_ACT_RELEASE 2 + +struct tc_tunnel_key { + tc_gen; + int t_action; +}; + +enum { + TCA_TUNNEL_KEY_UNSPEC, + TCA_TUNNEL_KEY_TM, + TCA_TUNNEL_KEY_PARMS, + TCA_TUNNEL_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV4_DST, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */ + TCA_TUNNEL_KEY_PAD, + __TCA_TUNNEL_KEY_MAX, +}; + +#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ccf931b3b94c..72e3426fa48f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -761,6 +761,17 @@ config NET_ACT_IFE To compile this code as a module, choose M here: the module will be called act_ife. +config NET_ACT_TUNNEL_KEY + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- + Say Y here to set/release ip tunnel metadata. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_tunnel_key. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index ae088a5a9d95..b9d046b9535a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c new file mode 100644 index 000000000000..dceff7412dc3 --- /dev/null +++ b/net/sched/act_tunnel_key.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TUNNEL_KEY_TAB_MASK 15 + +static int tunnel_key_net_id; +static struct tc_action_ops act_tunnel_key_ops; + +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + int action; + + rcu_read_lock(); + + params = rcu_dereference(t->params); + + tcf_lastuse_update(&t->tcf_tm); + bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + action = params->action; + + switch (params->tcft_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + skb_dst_drop(skb); + break; + case TCA_TUNNEL_KEY_ACT_SET: + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); + break; + default: + WARN_ONCE(1, "Bad tunnel_key action %d.\n", + params->tcft_action); + break; + } + + rcu_read_unlock(); + + return action; +} + +static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { + [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, + [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, +}; + +static int tunnel_key_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; + struct tcf_tunnel_key_params *params_old; + struct tcf_tunnel_key_params *params_new; + struct metadata_dst *metadata = NULL; + struct tc_tunnel_key *parm; + struct tcf_tunnel_key *t; + bool exists = false; + __be64 key_id; + int ret = 0; + int err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + switch (parm->t_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + break; + case TCA_TUNNEL_KEY_ACT_SET: + if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + ret = -EINVAL; + goto err_out; + } + + key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { + __be32 saddr; + __be32 daddr; + + saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); + daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); + + metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + TUNNEL_KEY, key_id, 0); + } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { + struct in6_addr saddr; + struct in6_addr daddr; + + saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); + daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); + + metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, + TUNNEL_KEY, key_id, 0); + } + + if (!metadata) { + ret = -EINVAL; + goto err_out; + } + + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; + break; + default: + goto err_out; + } + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_tunnel_key_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + t = to_tunnel_key(*a); + + ASSERT_RTNL(); + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + params_old = rtnl_dereference(t->params); + + params_new->action = parm->action; + params_new->tcft_action = parm->t_action; + params_new->tcft_enc_metadata = metadata; + + rcu_assign_pointer(t->params, params_new); + + if (params_old) + kfree_rcu(params_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + + return ret; + +err_out: + if (exists) + tcf_hash_release(*a, bind); + return ret; +} + +static void tunnel_key_release(struct tc_action *a, int bind) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + dst_release(¶ms->tcft_enc_metadata->dst); + + kfree_rcu(params, rcu); + + rcu_read_unlock(); +} + +static int tunnel_key_dump_addresses(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + unsigned short family = ip_tunnel_info_af(info); + + if (family == AF_INET) { + __be32 saddr = info->key.u.ipv4.src; + __be32 daddr = info->key.u.ipv4.dst; + + if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && + !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) + return 0; + } + + if (family == AF_INET6) { + const struct in6_addr *saddr6 = &info->key.u.ipv6.src; + const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; + + if (!nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && + !nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) + return 0; + } + + return -EINVAL; +} + +static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + struct tc_tunnel_key opt = { + .index = t->tcf_index, + .refcnt = t->tcf_refcnt - ref, + .bindcnt = t->tcf_bindcnt - bind, + }; + struct tcf_t tm; + int ret = -1; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + opt.t_action = params->tcft_action; + opt.action = params->action; + + if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { + struct ip_tunnel_key *key = + ¶ms->tcft_enc_metadata->u.tun_info.key; + __be32 key_id = tunnel_id_to_key32(key->tun_id); + + if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + tunnel_key_dump_addresses(skb, + ¶ms->tcft_enc_metadata->u.tun_info)) + goto nla_put_failure; + } + + tcf_tm_dump(&tm, &t->tcf_tm); + if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), + &tm, TCA_TUNNEL_KEY_PAD)) + goto nla_put_failure; + + ret = skb->len; + goto out; + +nla_put_failure: + nlmsg_trim(skb, b); +out: + rcu_read_unlock(); + + return ret; +} + +static int tunnel_key_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_tunnel_key_ops = { + .kind = "tunnel_key", + .type = TCA_ACT_TUNNEL_KEY, + .owner = THIS_MODULE, + .act = tunnel_key_act, + .dump = tunnel_key_dump, + .init = tunnel_key_init, + .cleanup = tunnel_key_release, + .walk = tunnel_key_walker, + .lookup = tunnel_key_search, + .size = sizeof(struct tcf_tunnel_key), +}; + +static __net_init int tunnel_key_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK); +} + +static void __net_exit tunnel_key_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations tunnel_key_net_ops = { + .init = tunnel_key_init_net, + .exit = tunnel_key_exit_net, + .id = &tunnel_key_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init tunnel_key_init_module(void) +{ + return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +static void __exit tunnel_key_cleanup_module(void) +{ + tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +module_init(tunnel_key_init_module); +module_exit(tunnel_key_cleanup_module); + +MODULE_AUTHOR("Amir Vadai "); +MODULE_DESCRIPTION("ip tunnel manipulation actions"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 6b6adee3dad25bbe568ee24fc843372d02fb425f Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:18 +0300 Subject: net/mlx5: SRIOV core code refactoring Simplify the code and makes it look modular and symmetric. Split sriov enable/disable to two levels: device level and pci level. When user enable/disable sriov (via sriov_configure driver callback) we will enable/disable both device and pci sriov. When driver load/unload we will enable/disable (on demand) only device sriov while keeping the PCI sriov enabled for next driver load. On internal/pci error, VFs will be kept enabled on PCI and the reset is done only in device level. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 12 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 216 +++++++++------------ include/linux/mlx5/driver.h | 2 - 4 files changed, 101 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c132ef1faefe..baba53fb58b7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1180,8 +1180,7 @@ out: return 0; err_sriov: - if (mlx5_sriov_cleanup(dev)) - dev_err(&dev->pdev->dev, "sriov cleanup failed\n"); + mlx5_sriov_cleanup(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); @@ -1241,19 +1240,14 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) { int err = 0; - err = mlx5_sriov_cleanup(dev); - if (err) { - dev_warn(&dev->pdev->dev, "%s: sriov cleanup failed - abort\n", - __func__); - return err; - } - mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", __func__); goto out; } + + mlx5_sriov_cleanup(dev); mlx5_unregister_device(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 714b71bed2be..7dd14cf73181 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -89,6 +89,8 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_enter_error_state(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs); bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev); int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 78e789245183..72a8215a2d02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -44,108 +44,132 @@ bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) return !!sriov->num_vfs; } -static void enable_vfs(struct mlx5_core_dev *dev, int num_vfs) +static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; int err; int vf; - for (vf = 1; vf <= num_vfs; vf++) { - err = mlx5_core_enable_hca(dev, vf); + if (sriov->enabled_vfs) { + mlx5_core_warn(dev, + "failed to enable SRIOV on device, already enabled with %d vfs\n", + sriov->enabled_vfs); + return -EBUSY; + } + +#ifdef CONFIG_MLX5_CORE_EN + err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); + if (err) { + mlx5_core_warn(dev, + "failed to enable eswitch SRIOV (%d)\n", err); + return err; + } +#endif + + for (vf = 0; vf < num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf + 1); if (err) { - mlx5_core_warn(dev, "failed to enable VF %d\n", vf - 1); - } else { - sriov->vfs_ctx[vf - 1].enabled = 1; - mlx5_core_dbg(dev, "successfully enabled VF %d\n", vf - 1); + mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); + continue; } + sriov->vfs_ctx[vf].enabled = 1; + sriov->enabled_vfs++; + mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); + } + + return 0; } -static void disable_vfs(struct mlx5_core_dev *dev, int num_vfs) +static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; int vf; - for (vf = 1; vf <= num_vfs; vf++) { - if (sriov->vfs_ctx[vf - 1].enabled) { - if (mlx5_core_disable_hca(dev, vf)) - mlx5_core_warn(dev, "failed to disable VF %d\n", vf - 1); - else - sriov->vfs_ctx[vf - 1].enabled = 0; + if (!sriov->enabled_vfs) + return; + + for (vf = 0; vf < sriov->num_vfs; vf++) { + if (!sriov->vfs_ctx[vf].enabled) + continue; + err = mlx5_core_disable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to disable VF %d\n", vf); + continue; } + sriov->vfs_ctx[vf].enabled = 0; + sriov->enabled_vfs--; } + +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_disable_sriov(dev->priv.eswitch); +#endif + + if (mlx5_wait_for_vf_pages(dev)) + mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } -static int mlx5_core_create_vfs(struct pci_dev *pdev, int num_vfs) +static int mlx5_pci_enable_sriov(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - int err; - - if (pci_num_vf(pdev)) - pci_disable_sriov(pdev); - - enable_vfs(dev, num_vfs); + int err = 0; - err = pci_enable_sriov(pdev, num_vfs); - if (err) { - dev_warn(&pdev->dev, "enable sriov failed %d\n", err); - goto ex; + if (pci_num_vf(pdev)) { + mlx5_core_warn(dev, "Unable to enable pci sriov, already enabled\n"); + return -EBUSY; } - return 0; + err = pci_enable_sriov(pdev, num_vfs); + if (err) + mlx5_core_warn(dev, "pci_enable_sriov failed : %d\n", err); -ex: - disable_vfs(dev, num_vfs); return err; } -static int mlx5_core_sriov_enable(struct pci_dev *pdev, int num_vfs) +static void mlx5_pci_disable_sriov(struct pci_dev *pdev) +{ + pci_disable_sriov(pdev); +} + +static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int err; + int err = 0; - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = kcalloc(num_vfs, sizeof(*sriov->vfs_ctx), GFP_ATOMIC); - if (!sriov->vfs_ctx) - return -ENOMEM; + err = mlx5_device_enable_sriov(dev, num_vfs); + if (err) { + mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err); + return err; + } - sriov->enabled_vfs = num_vfs; - err = mlx5_core_create_vfs(pdev, num_vfs); + err = mlx5_pci_enable_sriov(pdev, num_vfs); if (err) { - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = NULL; + mlx5_core_warn(dev, "mlx5_pci_enable_sriov failed : %d\n", err); + mlx5_device_disable_sriov(dev); return err; } + sriov->num_vfs = num_vfs; + return 0; } -static void mlx5_core_init_vfs(struct mlx5_core_dev *dev, int num_vfs) +static void mlx5_sriov_disable(struct pci_dev *pdev) { + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct mlx5_core_sriov *sriov = &dev->priv.sriov; - sriov->num_vfs = num_vfs; -} - -static void mlx5_core_cleanup_vfs(struct mlx5_core_dev *dev) -{ - struct mlx5_core_sriov *sriov; - - sriov = &dev->priv.sriov; - disable_vfs(dev, sriov->num_vfs); - - if (mlx5_wait_for_vf_pages(dev)) - mlx5_core_warn(dev, "timeout claiming VFs pages\n"); - + mlx5_pci_disable_sriov(pdev); + mlx5_device_disable_sriov(dev); sriov->num_vfs = 0; } int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int err; + int err = 0; mlx5_core_dbg(dev, "requested num_vfs %d\n", num_vfs); if (!mlx5_core_is_pf(dev)) @@ -156,92 +180,44 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) return -EINVAL; } - mlx5_core_cleanup_vfs(dev); - - if (!num_vfs) { -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_disable_sriov(dev->priv.eswitch); -#endif - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = NULL; - if (!pci_vfs_assigned(pdev)) - pci_disable_sriov(pdev); - else - mlx5_core_info(dev, "unloading PF driver while leaving orphan VFs\n"); - return 0; - } - - err = mlx5_core_sriov_enable(pdev, num_vfs); - if (err) { - mlx5_core_warn(dev, "mlx5_core_sriov_enable failed %d\n", err); - return err; - } + if (num_vfs) + err = mlx5_sriov_enable(pdev, num_vfs); + else + mlx5_sriov_disable(pdev); - mlx5_core_init_vfs(dev, num_vfs); -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); -#endif - - return num_vfs; -} - -static int sync_required(struct pci_dev *pdev) -{ - struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int cur_vfs = pci_num_vf(pdev); - - if (cur_vfs != sriov->num_vfs) { - mlx5_core_warn(dev, "current VFs %d, registered %d - sync needed\n", - cur_vfs, sriov->num_vfs); - return 1; - } - - return 0; + return err ? err : num_vfs; } int mlx5_sriov_init(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; struct pci_dev *pdev = dev->pdev; - int cur_vfs; + int total_vfs; if (!mlx5_core_is_pf(dev)) return 0; - if (!sync_required(dev->pdev)) - return 0; - - cur_vfs = pci_num_vf(pdev); - sriov->vfs_ctx = kcalloc(cur_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); + total_vfs = pci_sriov_get_totalvfs(pdev); + sriov->num_vfs = pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); if (!sriov->vfs_ctx) return -ENOMEM; - sriov->enabled_vfs = cur_vfs; - - mlx5_core_init_vfs(dev, cur_vfs); -#ifdef CONFIG_MLX5_CORE_EN - if (cur_vfs) - mlx5_eswitch_enable_sriov(dev->priv.eswitch, cur_vfs, - SRIOV_LEGACY); -#endif - - enable_vfs(dev, cur_vfs); + /* If sriov VFs exist in PCI level, enable them in device level */ + if (!sriov->num_vfs) + return 0; + mlx5_device_enable_sriov(dev, sriov->num_vfs); return 0; } -int mlx5_sriov_cleanup(struct mlx5_core_dev *dev) +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) { - struct pci_dev *pdev = dev->pdev; - int err; + struct mlx5_core_sriov *sriov = &dev->priv.sriov; if (!mlx5_core_is_pf(dev)) - return 0; + return; - err = mlx5_core_sriov_configure(pdev, 0); - if (err) - return err; - - return 0; + mlx5_device_disable_sriov(dev); + kfree(sriov->vfs_ctx); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5cb9fa7aec61..0d7aedfce1d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -828,8 +828,6 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); -int mlx5_sriov_init(struct mlx5_core_dev *dev); -int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); -- cgit v1.2.3 From 737a234bb6384800a5b632be85c6b0ad6221d137 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:19 +0300 Subject: net/mlx5: Introduce attach/detach to interface API Add attach/detach callbacks to interface API. This is crucial for implementing seamless reset flow which releases the hardware and it's resources upon detach while keeping software structures and state (e.g netdev) then reset and reallocate the hardware needed resources upon attach. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 149 +++++++++++++++++++++---- include/linux/mlx5/driver.h | 2 + 2 files changed, 131 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index baba53fb58b7..108d8f2e73e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -81,6 +81,7 @@ struct mlx5_device_context { struct list_head list; struct mlx5_interface *intf; void *context; + unsigned long state; }; enum { @@ -778,6 +779,11 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) return -ENOTSUPP; } +enum { + MLX5_INTERFACE_ADDED, + MLX5_INTERFACE_ATTACHED, +}; + static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_device_context *dev_ctx; @@ -786,12 +792,15 @@ static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) if (!mlx5_lag_intf_add(intf, priv)) return; - dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL); + dev_ctx = kzalloc(sizeof(*dev_ctx), GFP_KERNEL); if (!dev_ctx) return; - dev_ctx->intf = intf; + dev_ctx->intf = intf; dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + if (intf->attach) + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); if (dev_ctx->context) { spin_lock_irq(&priv->ctx_lock); @@ -802,21 +811,114 @@ static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) } } +static struct mlx5_device_context *mlx5_get_device(struct mlx5_interface *intf, + struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) + return dev_ctx; + return NULL; +} + static void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_device_context *dev_ctx; struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - list_for_each_entry(dev_ctx, &priv->ctx_list, list) - if (dev_ctx->intf == intf) { - spin_lock_irq(&priv->ctx_lock); - list_del(&dev_ctx->list); - spin_unlock_irq(&priv->ctx_lock); + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + intf->remove(dev, dev_ctx->context); - intf->remove(dev, dev_ctx->context); - kfree(dev_ctx); + kfree(dev_ctx); +} + +static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + if (intf->attach) { + if (test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) return; - } + intf->attach(dev, dev_ctx->context); + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + return; + dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } +} + +static void mlx5_attach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_attach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + if (intf->detach) { + if (!test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) + return; + intf->detach(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (!test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + return; + intf->remove(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } +} + +static void mlx5_detach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_detach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +static bool mlx5_device_registered(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv; + bool found = false; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) + if (priv == &dev->priv) + found = true; + mutex_unlock(&mlx5_intf_mutex); + + return found; } static int mlx5_register_device(struct mlx5_core_dev *dev) @@ -1162,16 +1264,16 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto err_sriov; } - err = mlx5_register_device(dev); - if (err) { - dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); - goto err_reg_dev; + if (mlx5_device_registered(dev)) { + mlx5_attach_device(dev); + } else { + err = mlx5_register_device(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); + goto err_reg_dev; + } } - err = request_module_nowait(MLX5_IB_MOD); - if (err) - pr_info("failed request module on %s\n", MLX5_IB_MOD); - clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); out: @@ -1247,12 +1349,13 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto out; } + if (mlx5_device_registered(dev)) + mlx5_detach_device(dev); + mlx5_sriov_cleanup(dev); - mlx5_unregister_device(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); #endif - mlx5_cleanup_rl_table(dev); mlx5_cleanup_fs(dev); mlx5_cleanup_mkey_table(dev); @@ -1364,6 +1467,9 @@ static int init_one(struct pci_dev *pdev, dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err); goto clean_health; } + err = request_module_nowait(MLX5_IB_MOD); + if (err) + pr_info("failed request module on %s\n", MLX5_IB_MOD); err = devlink_register(devlink, &pdev->dev); if (err) @@ -1391,11 +1497,14 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_priv *priv = &dev->priv; devlink_unregister(devlink); + mlx5_unregister_device(dev); + if (mlx5_unload_one(dev, priv)) { dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n"); mlx5_health_cleanup(dev); return; } + mlx5_health_cleanup(dev); mlx5_pci_close(dev, priv); pci_set_drvdata(pdev, NULL); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d7aedfce1d7..85c4786427e4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -930,6 +930,8 @@ enum { struct mlx5_interface { void * (*add)(struct mlx5_core_dev *dev); void (*remove)(struct mlx5_core_dev *dev, void *context); + int (*attach)(struct mlx5_core_dev *dev, void *context); + void (*detach)(struct mlx5_core_dev *dev, void *context); void (*event)(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param); void * (*get_dev)(void *context); -- cgit v1.2.3 From 3a8963acc70e69606729404713cfa9a03b58b18c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Sep 2016 12:45:24 -0700 Subject: Revert "hv_netvsc: make inline functions static" These functions are used by other code misc-next tree. This reverts commit 30d1de08c87ddde6f73936c3350e7e153988fe02. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 85 +-------------------------------------------- include/linux/hyperv.h | 84 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2a9ccc4d9e3c..ff05b9b0837f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -33,89 +33,6 @@ #include "hyperv_net.h" -/* - * An API to support in-place processing of incoming VMBUS packets. - */ -#define VMBUS_PKT_TRAILER 8 - -static struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - struct vmpacket_descriptor *cur_desc; - u32 packetlen; - u32 dsize = ring_info->ring_datasize; - u32 delta = read_loc - ring_info->ring_buffer->read_index; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - if ((read_loc + sizeof(*cur_desc)) > dsize) - return NULL; - - cur_desc = ring_buffer + read_loc; - packetlen = cur_desc->len8 << 3; - - /* - * If the packet under consideration is wrapping around, - * return failure. - */ - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) - return NULL; - - return cur_desc; -} - -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - BUG_ON((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize); - - /* - * Include the packet trailer. - */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; -} - -/* - * This call commits the read index and potentially signals the host. - * Here is the pattern for using the "in-place" consumption APIs: - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); - */ -static void commit_rd_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; - - if (hv_need_to_signal_on_read(ring_info)) - vmbus_set_event(channel); -} - /* * Switch the data path from the synthetic interface to the VF * interface. @@ -840,7 +757,7 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device, return msg_size; } -static int netvsc_send_pkt( +static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, struct netvsc_device *net_device, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b01c8c3dd531..5df444b1ac18 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1429,4 +1429,88 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } +/* + * An API to support in-place processing of incoming VMBUS packets. + */ +#define VMBUS_PKT_TRAILER 8 + +static inline struct vmpacket_descriptor * +get_next_pkt_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + void *ring_buffer = hv_get_ring_buffer(ring_info); + struct vmpacket_descriptor *cur_desc; + u32 packetlen; + u32 dsize = ring_info->ring_datasize; + u32 delta = read_loc - ring_info->ring_buffer->read_index; + u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); + + if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) + return NULL; + + if ((read_loc + sizeof(*cur_desc)) > dsize) + return NULL; + + cur_desc = ring_buffer + read_loc; + packetlen = cur_desc->len8 << 3; + + /* + * If the packet under consideration is wrapping around, + * return failure. + */ + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) + return NULL; + + return cur_desc; +} + +/* + * A helper function to step through packets "in-place" + * This API is to be called after each successful call + * get_next_pkt_raw(). + */ +static inline void put_pkt_raw(struct vmbus_channel *channel, + struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + u32 packetlen = desc->len8 << 3; + u32 dsize = ring_info->ring_datasize; + + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) + BUG(); + /* + * Include the packet trailer. + */ + ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; +} + +/* + * This call commits the read index and potentially signals the host. + * Here is the pattern for using the "in-place" consumption APIs: + * + * while (get_next_pkt_raw() { + * process the packet "in-place"; + * put_pkt_raw(); + * } + * if (packets processed in place) + * commit_rd_index(); + */ +static inline void commit_rd_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + virt_rmb(); + ring_info->ring_buffer->read_index = ring_info->priv_read_index; + + if (hv_need_to_signal_on_read(ring_info)) + vmbus_set_event(channel); +} + + #endif /* _HYPERV_H */ -- cgit v1.2.3 From 9ee0034b8f49aaaa7e7c2da8db1038915db99c19 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:52 -0700 Subject: net: flow: Add l3mdev flow update Add l3mdev hook to set FLOWI_FLAG_SKIP_NH_OIF flag and update oif/iif in flow struct if its oif or iif points to a device enslaved to an L3 Master device. Only 1 needs to be converted to match the l3mdev FIB rule. This moves the flow adjustment for l3mdev to a single point catching all lookups. It is redundant for existing hooks (those are removed in later patches) but is needed for missed lookups such as PMTU updates. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 6 ++++++ net/ipv4/fib_rules.c | 3 +++ net/ipv6/fib6_rules.c | 3 +++ net/l3mdev/l3mdev.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index e90095091aa0..81e175e80537 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -49,6 +49,8 @@ struct l3mdev_ops { int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +void l3mdev_update_flow(struct net *net, struct flowi *fl); + int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { @@ -290,6 +292,10 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } +static inline +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ +} #endif #endif /* _NET_L3MDEV_H_ */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 6e9ea69e5f75..770bebed6b28 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, }; int err; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi4_to_flowi(flp)); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 5857c1fc8b67..eea23b57c6a5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index c4a1c3e84e12..43610e5acc4e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -222,3 +222,38 @@ out: return rc; } + +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ + struct net_device *dev; + int ifindex; + + rcu_read_lock(); + + if (fl->flowi_oif) { + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_oif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + goto out; + } + } + } + + if (fl->flowi_iif) { + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_iif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + } + } + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(l3mdev_update_flow); -- cgit v1.2.3 From a8e3e1a9f02094145580ea7920c6a1d9aabd5539 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:53 -0700 Subject: net: l3mdev: Add hook to output path This patch adds the infrastructure to the output path to pass an skb to an l3mdev device if it has a hook registered. This is the Tx parallel to l3mdev_ip{6}_rcv in the receive path and is the basis for removing the existing hook that returns the vrf dst on the fib lookup. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 8 ++++++++ net/ipv6/ip6_output.c | 8 ++++++++ net/ipv6/output_core.c | 7 +++++++ net/ipv6/raw.c | 7 +++++++ 5 files changed, 78 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 81e175e80537..53d5274920e3 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -11,6 +11,7 @@ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ +#include #include /** @@ -18,6 +19,10 @@ * * @l3mdev_fib_table: Get FIB table id to use for lookups * + * @l3mdev_l3_rcv: Hook in L3 receive path + * + * @l3mdev_l3_out: Hook in L3 output path + * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * * @l3mdev_get_saddr: Get source address for a flow @@ -29,6 +34,9 @@ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); + struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, + struct sock *sk, struct sk_buff *skb, + u16 proto); /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, @@ -201,6 +209,34 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return l3mdev_l3_rcv(skb, AF_INET6); } +static inline +struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) +{ + struct net_device *dev = skb_dst(skb)->dev; + + if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master && master->l3mdev_ops->l3mdev_l3_out) + skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, + skb, proto); + } + + return skb; +} + +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET); +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET6); +} #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) @@ -286,6 +322,18 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return skb; } +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b913f5bf0757..41e10e34769c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 993fd9666f1b..6ea6caace3a8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -236,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out((struct sock *)sk, skb); + if (unlikely(!skb)) + return 0; + /* hooks should never assume socket lock is held. * we promote our socket to non const */ diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 462f2a76b5c2..7cca8ac66fe9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 590dd1f7746f..54404f08efcc 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); -- cgit v1.2.3 From 5f02ce24c2696fec33f2a5dfcf753996f5fdd211 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:54 -0700 Subject: net: l3mdev: Allow the l3mdev to be a loopback Allow an L3 master device to act as the loopback for that L3 domain. For IPv4 the device can also have the address 127.0.0.1. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 6 +++--- net/ipv4/route.c | 8 ++++++-- net/ipv6/route.c | 12 ++++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 53d5274920e3..3ee110518584 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -90,7 +90,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. @@ -99,7 +99,7 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; - const struct net_device *master; + struct net_device *master; if (!dev) return NULL; @@ -254,7 +254,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3e992783c1d0..f49b2c534e92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2018,7 +2018,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + if (ipv4_is_loopback(fl4->saddr) && + !(dev_out->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) @@ -2302,7 +2304,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, else fl4->saddr = fl4->daddr; } - dev_out = net->loopback_dev; + + /* L3 master device is the loopback for that domain */ + dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 09d43ff11a8d..2c681113c055 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2558,8 +2558,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT); + struct net_device *dev = net->loopback_dev; + struct rt6_info *rt; + + /* use L3 Master device as loopback for host routes if device + * is enslaved and address is not link local or multicast + */ + if (!rt6_need_strict(addr)) + dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; + + rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 4c1feac58e06270321cc500b85c2d94a11495775 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:56 -0700 Subject: net: vrf: Flip IPv6 output path from FIB lookup hook to out hook Flip the IPv6 output path to use the l3mdev tx out hook. The VRF dst is not returned on the first FIB lookup. Instead, the dst on the skb is switched at the beginning of the IPv6 output processing to send the packet to the VRF driver on xmit. Link scope addresses (linklocal and multicast) need special handling: specifically the oif the flow struct can not be changed because we want the lookup tied to the enslaved interface. ie., the source address and the returned route MUST point to the interface scope passed in. Convert the existing vrf_get_rt6_dst to handle only link scope addresses. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 124 ++++++++++++++++++++++++++++++++++----------------- include/net/l3mdev.h | 8 ++-- net/ipv6/route.c | 11 +++-- net/l3mdev/l3mdev.c | 15 +++---- 4 files changed, 100 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 08540b96ec18..f5372edf6edc 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -137,6 +137,20 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) +static int vrf_ip6_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, + sk, skb, NULL, skb_dst(skb)->dev, dst_output); + + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { @@ -207,7 +221,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); - ret = ip6_local_out(net, skb->sk, skb); + ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else @@ -391,6 +405,43 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rt6_info *rt6; + + /* don't divert link scope packets */ + if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) + return skb; + + rcu_read_lock(); + + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) { + dst = &rt6->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { @@ -477,6 +528,13 @@ out: return rc; } #else +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + return skb; +} + static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } @@ -587,6 +645,8 @@ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); + case AF_INET6: + return vrf_ip6_out(vrf_dev, sk, skb); } return skb; @@ -1031,53 +1091,33 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - struct flowi6 *fl6) +/* send to link-local or multicast address via interface enslaved to + * VRF device. Force lookup to VRF table without changing flow struct + */ +static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, + struct flowi6 *fl6) { - bool need_strict = rt6_need_strict(&fl6->daddr); - struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); + int flags = RT6_LOOKUP_F_IFACE; struct dst_entry *dst = NULL; struct rt6_info *rt; - /* send to link-local or multicast address */ - if (need_strict) { - int flags = RT6_LOOKUP_F_IFACE; - - /* VRF device does not have a link-local address and - * sending packets to link-local or mcast addresses over - * a VRF device does not make sense - */ - if (fl6->flowi6_oif == dev->ifindex) { - struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; - - dst_hold(dst); - return dst; - } - - if (!ipv6_addr_any(&fl6->saddr)) - flags |= RT6_LOOKUP_F_HAS_SADDR; - - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); - if (rt) - dst = &rt->dst; - - } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - - rcu_read_lock(); - - rt = rcu_dereference(vrf->rt6); - if (likely(rt)) { - dst = &rt->dst; - dst_hold(dst); - } - - rcu_read_unlock(); + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + return dst; } - /* make sure oif is set to VRF device for lookup */ - if (!need_strict) - fl6->flowi6_oif = dev->ifindex; + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) + dst = &rt->dst; return dst; } @@ -1130,7 +1170,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) - .l3mdev_get_rt6_dst = vrf_get_rt6_dst, + .l3mdev_link_scope_lookup = vrf_link_scope_lookup, .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 3ee110518584..51aab20a4d0a 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -27,7 +27,7 @@ * * @l3mdev_get_saddr: Get source address for a flow * - * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device + * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { @@ -45,7 +45,7 @@ struct l3mdev_ops { struct flowi4 *fl4); /* IPv6 ops */ - struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, + struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); int (*l3mdev_get_saddr6)(struct net_device *dev, const struct sock *sk, @@ -177,7 +177,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -299,7 +299,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex, } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2c681113c055..87e0a01ce744 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1188,12 +1188,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; bool any_src; - dst = l3mdev_get_rt6_dst(net, fl6); - if (dst) - return dst; + if (rt6_need_strict(&fl6->daddr)) { + struct dst_entry *dst; + + dst = l3mdev_link_scope_lookup(net, fl6); + if (dst) + return dst; + } fl6->flowi6_iif = LOOPBACK_IFINDEX; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 43610e5acc4e..ac9d928d0a9e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** - * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns - * cached route for L3 master device if relevant - * to flow + * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link + * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -121,15 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_rt6_dst) - dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + dev->l3mdev_ops->l3mdev_link_scope_lookup) + dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } -EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); +EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_get_saddr - get source address for a flow based on an interface -- cgit v1.2.3 From d66f6c0a8f3c0bcc4ee7a9b1da4b0ebe7ee555a3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:58 -0700 Subject: net: ipv4: Remove l3mdev_get_saddr No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 38 -------------------------------------- include/net/l3mdev.h | 12 ------------ include/net/route.h | 10 ---------- net/ipv4/raw.c | 6 ------ net/ipv4/udp.c | 6 ------ net/l3mdev/l3mdev.c | 31 ------------------------------- 6 files changed, 103 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f5372edf6edc..9ad2a169485f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -863,43 +863,6 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev, return rth; } -/* called under rcu_read_lock */ -static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) -{ - struct fib_result res = { .tclassid = 0 }; - struct net *net = dev_net(dev); - u32 orig_tos = fl4->flowi4_tos; - u8 flags = fl4->flowi4_flags; - u8 scope = fl4->flowi4_scope; - u8 tos = RT_FL_TOS(fl4); - int rc; - - if (unlikely(!fl4->daddr)) - return 0; - - fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; - fl4->flowi4_iif = LOOPBACK_IFINDEX; - /* make sure oif is set to VRF device for lookup */ - fl4->flowi4_oif = dev->ifindex; - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); - - rc = fib_lookup(net, fl4, &res, 0); - if (!rc) { - if (res.type == RTN_LOCAL) - fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; - else - fib_select_path(net, &res, fl4, -1); - } - - fl4->flowi4_flags = flags; - fl4->flowi4_tos = orig_tos; - fl4->flowi4_scope = scope; - - return rc; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -1166,7 +1129,6 @@ static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_get_rtable = vrf_get_rtable, - .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 51aab20a4d0a..1129e1d8cd6e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -25,8 +25,6 @@ * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * - * @l3mdev_get_saddr: Get source address for a flow - * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ @@ -41,8 +39,6 @@ struct l3mdev_ops { /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, const struct flowi4 *fl4); - int (*l3mdev_get_saddr)(struct net_device *dev, - struct flowi4 *fl4); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, @@ -175,8 +171,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return rc; } -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); - struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -292,12 +286,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return false; } -static inline int l3mdev_get_saddr(struct net *net, int ifindex, - struct flowi4 *fl4) -{ - return 0; -} - static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { diff --git a/include/net/route.h b/include/net/route.h index ad777d79af94..0429d47cad25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -285,15 +284,6 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_route_connect_init(fl4, dst, src, tos, oif, protocol, sport, dport, sk); - if (!src && oif) { - int rc; - - rc = l3mdev_get_saddr(net, oif, fl4); - if (rc < 0) - return ERR_PTR(rc); - - src = fl4->saddr; - } if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 438f50c1a676..90a85c955872 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, &fl4); - if (err < 0) - goto done; - } - if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 058c31286ce1..7d96dc2d3d08 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flow_flags, faddr, saddr, dport, inet->inet_sport); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, fl4); - if (err < 0) - goto out; - } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ac9d928d0a9e..be40df60703c 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,37 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -/** - * l3mdev_get_saddr - get source address for a flow based on an interface - * enslaved to an L3 master device - * @net: network namespace for device index lookup - * @ifindex: Interface index - * @fl4: IPv4 flow struct - */ - -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) -{ - struct net_device *dev; - int rc = 0; - - if (ifindex) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr) - rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr); - int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6) { -- cgit v1.2.3 From 8a966fc016b67d2a8ab4a83d22ded8cde032a0eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:59 -0700 Subject: net: ipv6: Remove l3mdev_get_saddr6 No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 41 ----------------------------------------- include/net/l3mdev.h | 11 ----------- net/ipv6/ip6_output.c | 9 +-------- net/l3mdev/l3mdev.c | 24 ------------------------ 4 files changed, 1 insertion(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9ad2a169485f..3a34f547c578 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1084,46 +1084,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, return dst; } - -/* called under rcu_read_lock */ -static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net *net = dev_net(dev); - struct dst_entry *dst; - struct rt6_info *rt; - int err; - - if (rt6_need_strict(&fl6->daddr)) { - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, - RT6_LOOKUP_F_IFACE); - if (unlikely(!rt)) - return 0; - - dst = &rt->dst; - } else { - __u8 flags = fl6->flowi6_flags; - - fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; - fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; - - dst = ip6_route_output(net, sk, fl6); - rt = (struct rt6_info *)dst; - - fl6->flowi6_flags = flags; - } - - err = dst->error; - if (!err) { - err = ip6_route_get_saddr(net, rt, &fl6->daddr, - sk ? inet6_sk(sk)->srcprefs : 0, - &fl6->saddr); - } - - dst_release(dst); - - return err; -} #endif static const struct l3mdev_ops vrf_l3mdev_ops = { @@ -1133,7 +1093,6 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, - .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 1129e1d8cd6e..a5e506eb51de 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -43,9 +43,6 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); - int (*l3mdev_get_saddr6)(struct net_device *dev, - const struct sock *sk, - struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -172,8 +169,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -292,12 +287,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) return NULL; } -static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - return 0; -} - static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1cb41b365048..6001e781164e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -926,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; - if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && - (!*dst || !(*dst)->error)) { - err = l3mdev_get_saddr6(net, sk, fl6); - if (err) - goto out_err; - } - /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -1024,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, out_err_release: dst_release(*dst); *dst = NULL; -out_err: + if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index be40df60703c..8da86ceca33d 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,30 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net_device *dev; - int rc = 0; - - if (fl6->flowi6_oif) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr6) - rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); - /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device -- cgit v1.2.3 From ca28b8f2b8f316b9973693c72770c98da3e9500e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:00 -0700 Subject: net: l3mdev: Remove l3mdev_fib_oif No longer used Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index a5e506eb51de..a586035c97cb 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -107,26 +107,6 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) return master; } -/* get index of an interface to use for FIB lookups. For devices - * enslaved to an L3 master device FIB lookups are based on the - * master index - */ -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return l3mdev_master_ifindex_rcu(dev) ? : dev->ifindex; -} - -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - int oif; - - rcu_read_lock(); - oif = l3mdev_fib_oif_rcu(dev); - rcu_read_unlock(); - - return oif; -} - u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) @@ -248,15 +228,6 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) return NULL; } -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} - static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; -- cgit v1.2.3 From afb460fe0ef0af6d98ed51006153acb01278df2d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:01 -0700 Subject: net: l3mdev: remove get_rtable method No longer used Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 21 --------------------- include/net/l3mdev.h | 21 --------------------- 2 files changed, 42 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3a34f547c578..ccce59fbb2b3 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -843,26 +843,6 @@ static u32 vrf_fib_table(const struct net_device *dev) return vrf->tb_id; } -static struct rtable *vrf_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - struct rtable *rth = NULL; - - if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { - struct net_vrf *vrf = netdev_priv(dev); - - rcu_read_lock(); - - rth = rcu_dereference(vrf->rth); - if (likely(rth)) - dst_hold(&rth->dst); - - rcu_read_unlock(); - } - - return rth; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -1088,7 +1068,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, - .l3mdev_get_rtable = vrf_get_rtable, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index a586035c97cb..3832099289c5 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -23,8 +23,6 @@ * * @l3mdev_l3_out: Hook in L3 output path * - * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device - * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ @@ -36,10 +34,6 @@ struct l3mdev_ops { struct sock *sk, struct sk_buff *skb, u16 proto); - /* IPv4 ops */ - struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, - const struct flowi4 *fl4); - /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); @@ -120,15 +114,6 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev) return tb_id; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rtable) - return dev->l3mdev_ops->l3mdev_get_rtable(dev, fl4); - - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; @@ -241,12 +226,6 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return 0; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; -- cgit v1.2.3 From c71ad3d45a5e928e617ca436f3ce88bb773fb766 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:02 -0700 Subject: net: flow: Remove FLOWI_FLAG_L3MDEV_SRC flag No longer used Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++--- include/net/flow.h | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index ccce59fbb2b3..55674b0e65b7 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -165,7 +165,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, - .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, + .flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF, }; int ret = NET_XMIT_DROP; struct dst_entry *dst; @@ -265,8 +265,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF, + .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; struct net *net = dev_net(vrf_dev); diff --git a/include/net/flow.h b/include/net/flow.h index d47ef4bb5423..035aa7716967 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -34,8 +34,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_L3MDEV_SRC 0x04 -#define FLOWI_FLAG_SKIP_NH_OIF 0x08 +#define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; -- cgit v1.2.3 From 5a1f044b5048e834f936fbb33a93e5d8410779ec Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 29 Aug 2016 23:25:14 +0300 Subject: cfg80211: clarify the requirements of .disconnect() cfg80211 expects the .disconnect() handler to call cfg80211_disconnect() when done. Make this requirement more explicit. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c23f4d33e06..d5e7f690bad9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2423,7 +2423,8 @@ struct cfg80211_qos_map { * cases, the result of roaming is indicated with a call to * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) - * @disconnect: Disconnect from the BSS/ESS. + * @disconnect: Disconnect from the BSS/ESS. Once done, call + * cfg80211_disconnected(). * (invoked with the wireless_dev mutex held) * * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call -- cgit v1.2.3 From 480dd46b9d6812e5fb7172c305ee0f1154c26eed Mon Sep 17 00:00:00 2001 From: Maxim Altshul Date: Mon, 22 Aug 2016 17:14:04 +0300 Subject: mac80211: RX BA support for sta max_rx_aggregation_subframes The ability to change the max_rx_aggregation frames is useful in cases of IOP. There exist some devices (latest mobile phones and some AP's) that tend to not respect a BA sessions maximum size (in Kbps). These devices won't respect the AMPDU size that was negotiated during association (even though they do respect the maximal number of packets). This violation is characterized by a valid number of packets in a single AMPDU. Even so, the total size will exceed the size negotiated during association. Eventually, this will cause some undefined behavior, which in turn causes the hw to drop packets, causing the throughput to plummet. This patch will make the subframe limitation to be held by each station, instead of being held only by hw. Signed-off-by: Maxim Altshul Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/agg-rx.c | 7 +++++-- net/mac80211/sta_info.c | 3 +++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cca510a585c3..a1457ca2a30c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1735,6 +1735,9 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU + * that this station is allowed to transmit to us. + * Can be modified by driver. * @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to @@ -1775,6 +1778,7 @@ struct ieee80211_sta { u16 aid; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u8 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..282e99bdb301 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -298,10 +298,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size = IEEE80211_MAX_AMPDU_BUF; /* make sure the size doesn't exceed the maximum supported by the hw */ - if (buf_size > local->hw.max_rx_aggregation_subframes) - buf_size = local->hw.max_rx_aggregation_subframes; + if (buf_size > sta->sta.max_rx_aggregation_subframes) + buf_size = sta->sta.max_rx_aggregation_subframes; params.buf_size = buf_size; + ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n", + buf_size, sta->sta.addr); + /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 19f14c907d74..5e70fa52e1ff 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -340,6 +340,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); + sta->sta.max_rx_aggregation_subframes = + local->hw.max_rx_aggregation_subframes; + sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; -- cgit v1.2.3 From 99ee7cae3bf3ce04e90d7b193d9f4f59a7044d91 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2016 23:25:17 +0300 Subject: mac80211: add support for radiotap timestamp field Use the existing device timestamp from the RX status information to add support for the new radiotap timestamp field. Currently only 32-bit counters are supported, but we also add the radiotap mactime where applicable. This new field allows more flexibility in where the timestamp is taken etc. The non-timestamp data in the field is taken from a new field in the hw struct. Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 21 +++++++++++++++++++++ include/net/mac80211.h | 12 ++++++++++++ net/mac80211/main.c | 3 +++ net/mac80211/rx.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+) (limited to 'include') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index b0fd9476c538..ba07b9d8ed63 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -190,6 +190,10 @@ struct ieee80211_radiotap_header { * IEEE80211_RADIOTAP_VHT u16, u8, u8, u8[4], u8, u8, u16 * * Contains VHT information about this frame. + * + * IEEE80211_RADIOTAP_TIMESTAMP u64, u16, u8, u8 variable + * + * Contains timestamp information for this frame. */ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_TSFT = 0, @@ -214,6 +218,7 @@ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_MCS = 19, IEEE80211_RADIOTAP_AMPDU_STATUS = 20, IEEE80211_RADIOTAP_VHT = 21, + IEEE80211_RADIOTAP_TIMESTAMP = 22, /* valid in every it_present bitmap, even vendor namespaces */ IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29, @@ -321,6 +326,22 @@ enum ieee80211_radiotap_type { #define IEEE80211_RADIOTAP_CODING_LDPC_USER2 0x04 #define IEEE80211_RADIOTAP_CODING_LDPC_USER3 0x08 +/* For IEEE80211_RADIOTAP_TIMESTAMP */ +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK 0x000F +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MS 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_US 0x0001 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS 0x0003 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK 0x00F0 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU 0x0010 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU 0x0020 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ 0x0030 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN 0x00F0 + +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT 0x00 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT 0x01 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY 0x02 + /* helpers */ static inline int ieee80211_get_radiotap_len(unsigned char *data) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a1457ca2a30c..08bac23c8de1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2145,6 +2145,14 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_* values. * + * @radiotap_timestamp: Information for the radiotap timestamp field; if the + * 'units_pos' member is set to a non-negative value it must be set to + * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a + * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value, and then the timestamp + * field will be added and populated from the &struct ieee80211_rx_status + * device_timestamp. If the 'accuracy' member is non-negative, it's put + * into the accuracy radiotap field and the accuracy known flag is set. + * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, * other features will be rejected during HW registration. @@ -2188,6 +2196,10 @@ struct ieee80211_hw { u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; u16 radiotap_vht_details; + struct { + int units_pos; + s16 accuracy; + } radiotap_timestamp; netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d00ea9b13f49..ac053a9df36d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_roc_setup(local); + local->hw.radiotap_timestamp.units_pos = -1; + local->hw.radiotap_timestamp.accuracy = -1; + return &local->hw; err_free: wiphy_free(wiphy); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6a265aa73a46..284f0f25e22e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + len = ALIGN(len, 8); + len += 12; + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + u16 accuracy = 0; + u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + + /* ensure 8 byte alignment */ + while ((pos - (u8 *)rthdr) & 7) + pos++; + + put_unaligned_le64(status->device_timestamp, pos); + pos += sizeof(u64); + + if (local->hw.radiotap_timestamp.accuracy >= 0) { + accuracy = local->hw.radiotap_timestamp.accuracy; + flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY; + } + put_unaligned_le16(accuracy, pos); + pos += sizeof(u16); + + *pos++ = local->hw.radiotap_timestamp.units_pos; + *pos++ = flags; + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; -- cgit v1.2.3 From 70ca767ea1b2748f45e96192400e515dddbe517c Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 6 Sep 2016 08:44:19 +0200 Subject: netfilter: nft_hash: Add hash offset value Add support to pass through an offset to the hash value. With this feature, the sysadmin is able to generate a hash with a given offset value. Example: meta mark set jhash ip saddr mod 2 seed 0xabcd offset 100 This option generates marks according to the source address from 100 to 101. Signed-off-by: Laura Garcia Liebana --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_hash.c | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 24161e25576d..8c653bbd1ead 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -731,6 +731,7 @@ enum nft_meta_keys { * @NFTA_HASH_LEN: source data length (NLA_U32) * @NFTA_HASH_MODULUS: modulus value (NLA_U32) * @NFTA_HASH_SEED: seed value (NLA_U32) + * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) */ enum nft_hash_attributes { NFTA_HASH_UNSPEC, @@ -739,6 +740,7 @@ enum nft_hash_attributes { NFTA_HASH_LEN, NFTA_HASH_MODULUS, NFTA_HASH_SEED, + NFTA_HASH_OFFSET, __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 764251d31e46..bd12f7a801c2 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -23,6 +23,7 @@ struct nft_hash { u8 len; u32 modulus; u32 seed; + u32 offset; }; static void nft_hash_eval(const struct nft_expr *expr, @@ -31,10 +32,10 @@ static void nft_hash_eval(const struct nft_expr *expr, { struct nft_hash *priv = nft_expr_priv(expr); const void *data = ®s->data[priv->sreg]; + u32 h; - regs->data[priv->dreg] = - reciprocal_scale(jhash(data, priv->len, priv->seed), - priv->modulus); + h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); + regs->data[priv->dreg] = h + priv->offset; } static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { @@ -59,6 +60,9 @@ static int nft_hash_init(const struct nft_ctx *ctx, !tb[NFTA_HASH_MODULUS]) return -EINVAL; + if (tb[NFTA_HASH_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); + priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); @@ -72,6 +76,9 @@ static int nft_hash_init(const struct nft_ctx *ctx, if (priv->modulus <= 1) return -ERANGE; + if (priv->offset + priv->modulus - 1 < U32_MAX) + return -EOVERFLOW; + priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); return nft_validate_register_load(priv->sreg, len) && @@ -94,7 +101,9 @@ static int nft_hash_dump(struct sk_buff *skb, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) goto nla_put_failure; - + if (priv->offset != 0) + if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From dbd2be0646e3239022630c426cbceefa15714bca Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Sep 2016 12:22:18 +0200 Subject: netfilter: nft_dynset: allow to invert match criteria The dynset expression matches if we can fit a new entry into the set. If there is no room for it, then it breaks the rule evaluation. This patch introduces the inversion flag so you can add rules to explicitly drop packets that don't fit into the set. For example: # nft filter input flow table xyz size 4 { ip saddr timeout 120s counter } overflow drop This is useful to provide a replacement for connlimit. For the rule above, every new entry uses the IPv4 address as key in the set, this entry gets a timeout of 120 seconds that gets refresh on every packet seen. If we get new flow and our set already contains 4 entries already, then this packet is dropped. You can already express this in positive logic, assuming default policy to drop: # nft filter input flow table xyz size 4 { ip saddr timeout 10s counter } accept Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nft_dynset.c | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8c653bbd1ead..bc0eb6a1066d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -575,6 +575,10 @@ enum nft_dynset_ops { NFT_DYNSET_OP_UPDATE, }; +enum nft_dynset_flags { + NFT_DYNSET_F_INV = (1 << 0), +}; + /** * enum nft_dynset_attributes - dynset expression attributes * @@ -585,6 +589,7 @@ enum nft_dynset_ops { * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_DYNSET_FLAGS: flags (NLA_U32) */ enum nft_dynset_attributes { NFTA_DYNSET_UNSPEC, @@ -596,6 +601,7 @@ enum nft_dynset_attributes { NFTA_DYNSET_TIMEOUT, NFTA_DYNSET_EXPR, NFTA_DYNSET_PAD, + NFTA_DYNSET_FLAGS, __NFTA_DYNSET_MAX, }; #define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 0af26699bf04..e3b83c31da2e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -22,6 +22,7 @@ struct nft_dynset { enum nft_dynset_ops op:8; enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; + bool invert; u64 timeout; struct nft_expr *expr; struct nft_set_binding binding; @@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr, if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); + + if (priv->invert) + regs->verdict.code = NFT_BREAK; return; } out: - regs->verdict.code = NFT_BREAK; + if (!priv->invert) + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { @@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, + [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; + if (tb[NFTA_DYNSET_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); + + if (flags & ~NFT_DYNSET_F_INV) + return -EINVAL; + if (flags & NFT_DYNSET_F_INV) + priv->invert = true; + } + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], genmask); if (IS_ERR(set)) { @@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_dynset *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From beac5afa2d78605b70f40cf5ab5601ab10659c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:49 +0200 Subject: netfilter: nf_tables: ensure proper initialization of nft_pktinfo fields This patch introduces nft_set_pktinfo_unspec() that ensures proper initialization all of pktinfo fields for non-IP traffic. This is used by the bridge, netdev and arp families. This new function relies on nft_set_pktinfo_proto_unspec() to set a new tprot_set field that indicates if transport protocol information is available. Remain fields are zeroed. The meta expression has been also updated to check to tprot_set in first place given that zero is a valid tprot value. Even a handcrafted packet may come with the IPPROTO_RAW (255) protocol number so we can't rely on this value as tprot unset. Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 18 ++++++++++++++++++ include/net/netfilter/nf_tables_ipv4.h | 1 + include/net/netfilter/nf_tables_ipv6.h | 1 + net/bridge/netfilter/nf_tables_bridge.c | 6 +++--- net/ipv4/netfilter/nf_tables_arp.c | 2 +- net/netfilter/nf_tables_netdev.c | 4 +++- net/netfilter/nft_meta.c | 2 ++ 7 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8972468bc94b..a7a7cebc8d07 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -19,6 +19,7 @@ struct nft_pktinfo { const struct net_device *out; u8 pf; u8 hook; + bool tprot_set; u8 tprot; /* for x_tables compatibility */ struct xt_action_param xt; @@ -36,6 +37,23 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, pkt->pf = pkt->xt.family = state->pf; } +static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt, + struct sk_buff *skb) +{ + pkt->tprot_set = false; + pkt->tprot = 0; + pkt->xt.thoff = 0; + pkt->xt.fragoff = 0; +} + +static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, skb, state); + nft_set_pktinfo_proto_unspec(pkt, skb); +} + /** * struct nft_verdict - nf_tables verdict * diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index ca6ef6bf775e..af952f7843ee 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -14,6 +14,7 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, nft_set_pktinfo(pkt, skb, state); ip = ip_hdr(pkt->skb); + pkt->tprot_set = true; pkt->tprot = ip->protocol; pkt->xt.thoff = ip_hdrlen(pkt->skb); pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 8ad39a6a5fe1..6aeee47b1b5e 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -19,6 +19,7 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, if (protohdr < 0) return -1; + pkt->tprot_set = true; pkt->tprot = protohdr; pkt->xt.thoff = thoff; pkt->xt.fragoff = frag_off; diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index a78c4e2826e5..29899887163e 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -71,7 +71,7 @@ static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, if (nft_bridge_iphdr_validate(skb)) nft_set_pktinfo_ipv4(pkt, skb, state); else - nft_set_pktinfo(pkt, skb, state); + nft_set_pktinfo_unspec(pkt, skb, state); } static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, @@ -83,7 +83,7 @@ static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, nft_set_pktinfo_ipv6(pkt, skb, state) == 0) return; #endif - nft_set_pktinfo(pkt, skb, state); + nft_set_pktinfo_unspec(pkt, skb, state); } static unsigned int @@ -101,7 +101,7 @@ nft_do_chain_bridge(void *priv, nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index cd84d4295a20..058c034be376 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); return nft_do_chain(&pkt, priv); } diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 5eefe4a355c6..8de502b0c37b 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -41,6 +41,7 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, else if (len < thoff) return; + pkt->tprot_set = true; pkt->tprot = iph->protocol; pkt->xt.thoff = thoff; pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; @@ -74,6 +75,7 @@ __nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, if (protohdr < 0) return; + pkt->tprot_set = true; pkt->tprot = protohdr; pkt->xt.thoff = thoff; pkt->xt.fragoff = frag_off; @@ -102,7 +104,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..14264edf2d77 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -52,6 +52,8 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = pkt->pf; break; case NFT_META_L4PROTO: + if (!pkt->tprot_set) + goto err; *dest = pkt->tprot; break; case NFT_META_PRIORITY: -- cgit v1.2.3 From 8df9e32e7ed2978f90cce780ce6a27513044158a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:50 +0200 Subject: netfilter: nf_tables_ipv6: setup pktinfo transport field on failure to parse Make sure the pktinfo protocol fields are initialized if this fails to parse the transport header. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 6aeee47b1b5e..1e0ffd5aea47 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -15,9 +15,10 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, nft_set_pktinfo(pkt, skb, state); protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); - /* If malformed, drop it */ - if (protohdr < 0) + if (protohdr < 0) { + nft_set_pktinfo_proto_unspec(pkt, skb); return -1; + } pkt->tprot_set = true; pkt->tprot = protohdr; -- cgit v1.2.3 From ddc8b6027ad08d145a6d7a6a6abc00e43f315bd1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:51 +0200 Subject: netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate() These functions are extracted from the netdev family, they initialize the pktinfo structure and validate that the IPv4 and IPv6 headers are well-formed given that these functions are called from a path where layer 3 sanitization did not happen yet. These functions are placed in include/net/netfilter/nf_tables_ipv{4,6}.h so they can be reused by a follow up patch to use them from the bridge family too. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv4.h | 42 ++++++++++++++++++ include/net/netfilter/nf_tables_ipv6.h | 49 +++++++++++++++++++++ net/netfilter/nf_tables_netdev.c | 79 +--------------------------------- 3 files changed, 93 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index af952f7843ee..968f00b82fb5 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -20,6 +20,48 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; } +static inline int +__nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct iphdr *iph, _iph; + u32 len, thoff; + + iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), + &_iph); + if (!iph) + return -1; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return -1; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) + return -1; + else if (len < thoff) + return -1; + + pkt->tprot_set = true; + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + + return 0; +} + +static inline void +nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, skb, state); + if (__nft_set_pktinfo_ipv4_validate(pkt, skb, state) < 0) + nft_set_pktinfo_proto_unspec(pkt, skb); +} + extern struct nft_af_info nft_af_ipv4; #endif diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 1e0ffd5aea47..39b7b717b540 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -28,6 +28,55 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, return 0; } +static inline int +__nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + unsigned int thoff = 0; + unsigned short frag_off; + int protohdr; + u32 pkt_len; + + ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), + &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) + return -1; + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + if (protohdr < 0) + return -1; + + pkt->tprot_set = true; + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; + + return 0; +#else + return -1; +#endif +} + +static inline void +nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, skb, state); + if (__nft_set_pktinfo_ipv6_validate(pkt, skb, state) < 0) + nft_set_pktinfo_proto_unspec(pkt, skb); +} + extern struct nft_af_info nft_af_ipv6; #endif diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 8de502b0c37b..3e5475a833a5 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -15,81 +15,6 @@ #include #include -static inline void -nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct iphdr *iph, _iph; - u32 len, thoff; - - nft_set_pktinfo(pkt, skb, state); - - iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), - &_iph); - if (!iph) - return; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return; - - len = ntohs(iph->tot_len); - thoff = iph->ihl * 4; - if (skb->len < len) - return; - else if (len < thoff) - return; - - pkt->tprot_set = true; - pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; -} - -static inline void -__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6hdr *ip6h, _ip6h; - unsigned int thoff = 0; - unsigned short frag_off; - int protohdr; - u32 pkt_len; - - ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), - &_ip6h); - if (!ip6h) - return; - - if (ip6h->version != 6) - return; - - pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) - return; - - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); - if (protohdr < 0) - return; - - pkt->tprot_set = true; - pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; -#endif -} - -static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - nft_set_pktinfo(pkt, skb, state); - __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); -} - static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -98,10 +23,10 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: nft_set_pktinfo_unspec(&pkt, skb, state); -- cgit v1.2.3 From 10151d7b03e23afce76a59f717f2616a10ddef86 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:52 +0200 Subject: netfilter: nf_tables_bridge: use nft_set_pktinfo_ipv{4, 6}_validate Consolidate pktinfo setup and validation by using the new generic functions so we converge to the netdev family codebase. We only need a linear IPv4 and IPv6 header from the reject expression, so move nft_bridge_iphdr_validate() and nft_bridge_ip6hdr_validate() to net/bridge/netfilter/nft_reject_bridge.c. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_bridge.h | 7 ---- net/bridge/netfilter/nf_tables_bridge.c | 72 +------------------------------- net/bridge/netfilter/nft_reject_bridge.c | 44 ++++++++++++++++++- 3 files changed, 45 insertions(+), 78 deletions(-) delete mode 100644 include/net/netfilter/nf_tables_bridge.h (limited to 'include') diff --git a/include/net/netfilter/nf_tables_bridge.h b/include/net/netfilter/nf_tables_bridge.h deleted file mode 100644 index 511fb79f6dad..000000000000 --- a/include/net/netfilter/nf_tables_bridge.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _NET_NF_TABLES_BRIDGE_H -#define _NET_NF_TABLES_BRIDGE_H - -int nft_bridge_iphdr_validate(struct sk_buff *skb); -int nft_bridge_ip6hdr_validate(struct sk_buff *skb); - -#endif /* _NET_NF_TABLES_BRIDGE_H */ diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 29899887163e..06f0f81456a0 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -13,79 +13,11 @@ #include #include #include -#include #include #include #include #include -int nft_bridge_iphdr_validate(struct sk_buff *skb) -{ - struct iphdr *iph; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return 0; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return 0; - - len = ntohs(iph->tot_len); - if (skb->len < len) - return 0; - else if (len < (iph->ihl*4)) - return 0; - - if (!pskb_may_pull(skb, iph->ihl*4)) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate); - -int nft_bridge_ip6hdr_validate(struct sk_buff *skb) -{ - struct ipv6hdr *hdr; - u32 pkt_len; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return 0; - - hdr = ipv6_hdr(skb); - if (hdr->version != 6) - return 0; - - pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); - -static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, skb, state); - else - nft_set_pktinfo_unspec(pkt, skb, state); -} - -static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, skb, state) == 0) - return; -#endif - nft_set_pktinfo_unspec(pkt, skb, state); -} - static unsigned int nft_do_chain_bridge(void *priv, struct sk_buff *skb, @@ -95,10 +27,10 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: nft_set_pktinfo_unspec(&pkt, skb, state); diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 0b77ffbc27d6..4b3df6b0e3b9 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, skb_pull(nskb, ETH_HLEN); } +static int nft_bridge_iphdr_validate(struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return 0; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return 0; + + len = ntohs(iph->tot_len); + if (skb->len < len) + return 0; + else if (len < (iph->ihl*4)) + return 0; + + if (!pskb_may_pull(skb, iph->ihl*4)) + return 0; + + return 1; +} + /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ @@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net, br_forward(br_port_get_rcu(dev), nskb, false, true); } +static int nft_bridge_ip6hdr_validate(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return 0; + + pkt_len = ntohs(hdr->payload_len); + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return 0; + + return 1; +} + static void nft_reject_br_send_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, -- cgit v1.2.3 From 71212c9b04eba76faa4dca26ccd1552d6bb300c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:53 +0200 Subject: netfilter: nf_tables: don't drop IPv6 packets that cannot parse transport This is overly conservative and not flexible at all, so better let them go through and let the filtering policy decide what to do with them. We use skb_header_pointer() all over the place so we would just fail to match when trying to access fields from malformed traffic. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 6 ++---- net/ipv6/netfilter/nf_tables_ipv6.c | 4 +--- net/ipv6/netfilter/nft_chain_route_ipv6.c | 4 +--- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 39b7b717b540..d150b5066201 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -4,7 +4,7 @@ #include #include -static inline int +static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) @@ -17,15 +17,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); - return -1; + return; } pkt->tprot_set = true; pkt->tprot = protohdr; pkt->xt.thoff = thoff; pkt->xt.fragoff = frag_off; - - return 0; } static inline int diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 30b22f4dff55..05d05926962a 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..01eb0f658366 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -32,9 +32,7 @@ static unsigned int nf_route_table_hook(void *priv, u_int8_t hop_limit; u32 mark, flowlabel; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); -- cgit v1.2.3 From 6bd14303a908833e10ac731a3308eba938305269 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 11 Sep 2016 22:05:27 +0800 Subject: netfilter: nf_queue: get rid of dependency on IP6_NF_IPTABLES hash_v6 is used by both nftables and ip6tables, so depend on IP6_NF_IPTABLES is not properly. Actually, it only parses ipv6hdr and computes a hash value, so even if IPV6 is disabled, there's no side effect too, remove it. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 0dbce55437f2..cc8a11f7e306 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -54,7 +54,6 @@ static inline u32 hash_v4(const struct sk_buff *skb, u32 jhash_initval) (__force u32)iph->saddr, iph->protocol, jhash_initval); } -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -77,7 +76,6 @@ static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval) return jhash_3words(a, b, c, jhash_initval); } -#endif static inline u32 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, @@ -85,10 +83,8 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, { if (family == NFPROTO_IPV4) queue += ((u64) hash_v4(skb, jhash_initval) * queues_total) >> 32; -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (family == NFPROTO_IPV6) queue += ((u64) hash_v6(skb, jhash_initval) * queues_total) >> 32; -#endif return queue; } -- cgit v1.2.3 From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 11 Sep 2016 22:55:53 +0200 Subject: netfilter: conntrack: remove packet hotpath stats These counters sit in hot path and do show up in perf, this is especially true for 'found' and 'searched' which get incremented for every packet processed. Information like searched=212030105 new=623431 found=333613 delete=623327 does not seem too helpful nowadays: - on busy systems found and searched will overflow every few hours (these are 32bit integers), other more busy ones every few days. - for debugging there are better methods, such as iptables' trace target, the conntrack log sysctls. Nowadays we also have perf tool. This removes packet path stat counters except those that are expected to be 0 (or close to 0) on a normal system, e.g. 'insert_failed' (race happened) or 'invalid' (proto tracker rejects). The insert stat is retained for the ctnetlink case. The found stat is retained for the tuple-is-taken check when NAT has to determine if it needs to pick a different source address. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 4 ---- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 8 ++++---- net/netfilter/nf_conntrack_core.c | 14 ++------------ net/netfilter/nf_conntrack_netlink.c | 6 +----- net/netfilter/nf_conntrack_standalone.c | 8 ++++---- 5 files changed, 11 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 275505792664..1d1ef4e20512 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -4,13 +4,9 @@ #include struct ip_conntrack_stat { - unsigned int searched; unsigned int found; - unsigned int new; unsigned int invalid; unsigned int ignore; - unsigned int delete; - unsigned int delete_list; unsigned int insert; unsigned int insert_failed; unsigned int drop; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 9df789709abe..6deb8867c5fc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -231,13 +231,13 @@ enum ctattr_secctx { enum ctattr_stats_cpu { CTA_STATS_UNSPEC, - CTA_STATS_SEARCHED, + CTA_STATS_SEARCHED, /* no longer used */ CTA_STATS_FOUND, - CTA_STATS_NEW, + CTA_STATS_NEW, /* no longer used */ CTA_STATS_INVALID, CTA_STATS_IGNORE, - CTA_STATS_DELETE, - CTA_STATS_DELETE_LIST, + CTA_STATS_DELETE, /* no longer used */ + CTA_STATS_DELETE_LIST, /* no longer used */ CTA_STATS_INSERT, CTA_STATS_INSERT_FAILED, CTA_STATS_DROP, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ac1db4019d5c..8d1ddb9b63ed 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -379,7 +379,6 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); @@ -406,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_del_from_dying_or_unconfirmed_list(ct); - NF_CT_STAT_INC(net, delete); local_bh_enable(); if (ct->master) @@ -438,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) nf_ct_add_to_dying_list(ct); - NF_CT_STAT_INC(net, delete_list); local_bh_enable(); } @@ -529,11 +526,8 @@ begin: if (nf_ct_is_dying(ct)) continue; - if (nf_ct_key_equal(h, tuple, zone, net)) { - NF_CT_STAT_INC_ATOMIC(net, found); + if (nf_ct_key_equal(h, tuple, zone, net)) return h; - } - NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -798,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert); local_bh_enable(); help = nfct_help(ct); @@ -857,7 +850,6 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_unlock(); return 1; } - NF_CT_STAT_INC_ATOMIC(net, searched); } if (get_nulls_value(n) != hash) { @@ -1177,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock(&nf_conntrack_expect_lock); } - if (!exp) { + if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - NF_CT_STAT_INC(net, new); - } /* Now it is inserted into the unconfirmed list, bump refcount */ nf_conntrack_get(&ct->ct_general); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c052b712c49f..27540455dc62 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1984,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || - nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || - nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || + if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || - nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || - nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 3d9a316a3c77..7d52f8401afd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -352,13 +352,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - st->searched, + 0, st->found, - st->new, + 0, st->invalid, st->ignore, - st->delete, - st->delete_list, + 0, + 0, st->insert, st->insert_failed, st->drop, -- cgit v1.2.3 From c7e9dbcf09bddd01568113103d62423d8894eabd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 10:03:00 +0200 Subject: mac80211: remove sta_remove_debugfs driver callback No drivers implement this, relying either on the recursive directory removal to remove their debugfs, or not having any to start with. Remove the dead driver callback. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 ++--------- net/mac80211/debugfs_sta.c | 4 ---- net/mac80211/driver-ops.h | 15 --------------- 3 files changed, 2 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 08bac23c8de1..d9c8ccd6b4e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3101,11 +3101,8 @@ enum ieee80211_reconfig_type { * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback - * and @sta_remove_debugfs should be within a CONFIG_MAC80211_DEBUGFS - * conditional. This callback can sleep. - * - * @sta_remove_debugfs: Remove the debugfs files which were added using - * @sta_add_debugfs. This callback can sleep. + * should be within a CONFIG_MAC80211_DEBUGFS conditional. This + * callback can sleep. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating @@ -3501,10 +3498,6 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); - void (*sta_remove_debugfs)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index fb2693582e40..a2fcdb47a0e6 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -544,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) void ieee80211_sta_debugfs_remove(struct sta_info *sta) { - struct ieee80211_local *local = sta->local; - struct ieee80211_sub_if_data *sdata = sta->sdata; - - drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c39f93b48791..fe35a1c0dc86 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -499,21 +499,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } - -static inline void drv_sta_remove_debugfs(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, - struct dentry *dir) -{ - might_sleep(); - - sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); - - if (local->ops->sta_remove_debugfs) - local->ops->sta_remove_debugfs(&local->hw, &sdata->vif, - sta, dir); -} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, -- cgit v1.2.3 From e8a24cd4b87247beedb1addc7b683422092047e5 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Wed, 14 Sep 2016 12:48:32 +0530 Subject: mac80211: allow driver to handle packet-loss mechanism Based on consecutive msdu failures, mac80211 triggers CQM packet-loss mechanism. Drivers like ath10k that have its own connection monitoring algorithm, offloaded to firmware for triggering station kickout. In case of station kickout, driver will report low ack status by mac80211 API (ieee80211_report_low_ack). This flag will enable the driver to completely rely on firmware events for station kickout and bypass mac80211 packet loss mechanism. Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/debugfs.c | 1 + net/mac80211/status.c | 6 ++++++ 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d9c8ccd6b4e6..5296100f3889 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2018,6 +2018,11 @@ struct ieee80211_txq { * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list * skbs, needed for zero-copy software A-MSDU. * + * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event + * by ieee80211_report_low_ack() based on its own algorithm. For such + * drivers, mac80211 packet loss mechanism will not be triggered and driver + * is completely depending on firmware event for station kickout. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2058,6 +2063,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_USES_RSS, IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, + IEEE80211_HW_REPORTS_LOW_ACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5bbb470f335f..8ca62b6bb02a 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -201,6 +201,7 @@ static const char *hw_flag_names[] = { FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), + FLAG(REPORTS_LOW_ACK), #undef FLAG }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index fabd9ff710d9..ea39f8a7baf3 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { + /* If driver relies on its own algorithm for station kickout, skip + * mac80211 packet loss mechanism. + */ + if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) -- cgit v1.2.3 From 86da71b57383d40993cb90baafb3735cffe5d800 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 12 Sep 2016 20:13:09 -0400 Subject: net_sched: Introduce skbmod action This action is intended to be an upgrade from a usability perspective from pedit (as well as operational debugability). Compare this: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action pedit munge offset -14 u8 set 0x02 \ munge offset -13 u8 set 0x15 \ munge offset -12 u8 set 0x15 \ munge offset -11 u8 set 0x15 \ munge offset -10 u16 set 0x1515 \ pipe to: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action skbmod dmac 02:15:15:15:15:15 Also try to do a MAC address swap with pedit or worse try to debug a policy with destination mac, source mac and etherype. Then make few rules out of those and you'll get my point. In the future common use cases on pedit can be migrated to this action (as an example different fields in ip v4/6, transports like tcp/udp/sctp etc). For this first cut, this allows modifying basic ethernet header. The most important ethernet use case at the moment is when redirecting or mirroring packets to a remote machine. The dst mac address needs a re-write so that it doesnt get dropped or confuse an interconnecting (learning) switch or dropped by a target machine (which looks at the dst mac). And at times when flipping back the packet a swap of the MAC addresses is needed. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbmod.h | 30 ++++ include/uapi/linux/tc_act/tc_skbmod.h | 39 +++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/act_skbmod.c | 301 ++++++++++++++++++++++++++++++++++ 5 files changed, 382 insertions(+) create mode 100644 include/net/tc_act/tc_skbmod.h create mode 100644 include/uapi/linux/tc_act/tc_skbmod.h create mode 100644 net/sched/act_skbmod.c (limited to 'include') diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..644a2116b47b --- /dev/null +++ b/include/net/tc_act/tc_skbmod.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __NET_TC_SKBMOD_H +#define __NET_TC_SKBMOD_H + +#include +#include + +struct tcf_skbmod_params { + struct rcu_head rcu; + u64 flags; /*up to 64 types of operations; extend if needed */ + u8 eth_dst[ETH_ALEN]; + u16 eth_type; + u8 eth_src[ETH_ALEN]; +}; + +struct tcf_skbmod { + struct tc_action common; + struct tcf_skbmod_params __rcu *skbmod_p; +}; +#define to_skbmod(a) ((struct tcf_skbmod *)a) + +#endif /* __NET_TC_SKBMOD_H */ diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..10fc07da6c69 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __LINUX_TC_SKBMOD_H +#define __LINUX_TC_SKBMOD_H + +#include + +#define TCA_ACT_SKBMOD 15 + +#define SKBMOD_F_DMAC 0x1 +#define SKBMOD_F_SMAC 0x2 +#define SKBMOD_F_ETYPE 0x4 +#define SKBMOD_F_SWAPMAC 0x8 + +struct tc_skbmod { + tc_gen; + __u64 flags; +}; + +enum { + TCA_SKBMOD_UNSPEC, + TCA_SKBMOD_TM, + TCA_SKBMOD_PARMS, + TCA_SKBMOD_DMAC, + TCA_SKBMOD_SMAC, + TCA_SKBMOD_ETYPE, + TCA_SKBMOD_PAD, + __TCA_SKBMOD_MAX +}; +#define TCA_SKBMOD_MAX (__TCA_SKBMOD_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 72e3426fa48f..7795d5a3f79a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -749,6 +749,17 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_SKBMOD + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbmod. + config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index b9d046b9535a..148ae0d5ac2c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c new file mode 100644 index 000000000000..e7d96381c908 --- /dev/null +++ b/net/sched/act_skbmod.c @@ -0,0 +1,301 @@ +/* + * net/sched/act_skbmod.c skb data modifier + * + * Copyright (c) 2016 Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SKBMOD_TAB_MASK 15 + +static int skbmod_net_id; +static struct tc_action_ops act_skbmod_ops; + +#define MAX_EDIT_LEN ETH_HLEN +static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbmod *d = to_skbmod(a); + int action; + struct tcf_skbmod_params *p; + u64 flags; + int err; + + tcf_lastuse_update(&d->tcf_tm); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + /* XXX: if you are going to edit more fields beyond ethernet header + * (example when you add IP header replacement or vlan swap) + * then MAX_EDIT_LEN needs to change appropriately + */ + err = skb_ensure_writable(skb, MAX_EDIT_LEN); + if (unlikely(err)) { /* best policy is to drop on the floor */ + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; + } + + rcu_read_lock(); + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) { + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + rcu_read_unlock(); + return action; + } + + p = rcu_dereference(d->skbmod_p); + flags = p->flags; + if (flags & SKBMOD_F_DMAC) + ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); + if (flags & SKBMOD_F_SMAC) + ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); + if (flags & SKBMOD_F_ETYPE) + eth_hdr(skb)->h_proto = p->eth_type; + rcu_read_unlock(); + + if (flags & SKBMOD_F_SWAPMAC) { + u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ + /*XXX: I am sure we can come up with more efficient swapping*/ + ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source); + ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); + } + + return action; +} + +static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { + [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) }, + [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 }, +}; + +static int tcf_skbmod_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct nlattr *tb[TCA_SKBMOD_MAX + 1]; + struct tcf_skbmod_params *p, *p_old; + struct tc_skbmod *parm; + struct tcf_skbmod *d; + bool exists = false; + u8 *daddr = NULL; + u8 *saddr = NULL; + u16 eth_type = 0; + u32 lflags = 0; + int ret = 0, err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy); + if (err < 0) + return err; + + if (!tb[TCA_SKBMOD_PARMS]) + return -EINVAL; + + if (tb[TCA_SKBMOD_DMAC]) { + daddr = nla_data(tb[TCA_SKBMOD_DMAC]); + lflags |= SKBMOD_F_DMAC; + } + + if (tb[TCA_SKBMOD_SMAC]) { + saddr = nla_data(tb[TCA_SKBMOD_SMAC]); + lflags |= SKBMOD_F_SMAC; + } + + if (tb[TCA_SKBMOD_ETYPE]) { + eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]); + lflags |= SKBMOD_F_ETYPE; + } + + parm = nla_data(tb[TCA_SKBMOD_PARMS]); + if (parm->flags & SKBMOD_F_SWAPMAC) + lflags = SKBMOD_F_SWAPMAC; + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!lflags) + return -EINVAL; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_skbmod_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + d = to_skbmod(*a); + + ASSERT_RTNL(); + p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); + if (unlikely(!p)) { + if (ovr) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + p->flags = lflags; + d->tcf_action = parm->action; + + p_old = rtnl_dereference(d->skbmod_p); + + if (ovr) + spin_lock_bh(&d->tcf_lock); + + if (lflags & SKBMOD_F_DMAC) + ether_addr_copy(p->eth_dst, daddr); + if (lflags & SKBMOD_F_SMAC) + ether_addr_copy(p->eth_src, saddr); + if (lflags & SKBMOD_F_ETYPE) + p->eth_type = htons(eth_type); + + rcu_assign_pointer(d->skbmod_p, p); + if (ovr) + spin_unlock_bh(&d->tcf_lock); + + if (p_old) + kfree_rcu(p_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbmod *d = to_skbmod(a); + struct tcf_skbmod_params *p; + + p = rcu_dereference_protected(d->skbmod_p, 1); + kfree_rcu(p, rcu); +} + +static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_skbmod *d = to_skbmod(a); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tc_skbmod opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + opt.flags = p->flags; + if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_DMAC) && + nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_SMAC) && + nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_ETYPE) && + nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type))) + goto nla_put_failure; + + tcf_tm_dump(&t, &d->tcf_tm); + if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + rcu_read_unlock(); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_skbmod_ops = { + .kind = "skbmod", + .type = TCA_ACT_SKBMOD, + .owner = THIS_MODULE, + .act = tcf_skbmod_run, + .dump = tcf_skbmod_dump, + .init = tcf_skbmod_init, + .cleanup = tcf_skbmod_cleanup, + .walk = tcf_skbmod_walker, + .lookup = tcf_skbmod_search, + .size = sizeof(struct tcf_skbmod), +}; + +static __net_init int skbmod_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK); +} + +static void __net_exit skbmod_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbmod_net_ops = { + .init = skbmod_init_net, + .exit = skbmod_exit_net, + .id = &skbmod_net_id, + .size = sizeof(struct tc_action_net), +}; + +MODULE_AUTHOR("Jamal Hadi Salim, "); +MODULE_DESCRIPTION("SKB data mod-ing"); +MODULE_LICENSE("GPL"); + +static int __init skbmod_init_module(void) +{ + return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops); +} + +static void __exit skbmod_cleanup_module(void) +{ + tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops); +} + +module_init(skbmod_init_module); +module_exit(skbmod_cleanup_module); -- cgit v1.2.3 From aa72d708373dacfa690960b336543b867784b350 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:22 +0300 Subject: net/sched: cls_flower: Support masking for matching on tcp/udp ports Add the definitions for src/dst udp/tcp port masks and use them when setting && dumping the relevant keys. Signed-off-by: Or Gerlitz Signed-off-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 4 ++++ net/sched/cls_flower.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index f9c287c67eae..60ea2a084880 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -442,6 +442,10 @@ enum { TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b084b2aab2d7..027523c82797 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -335,6 +335,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -432,17 +436,17 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); } @@ -877,18 +881,18 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (key->basic.ip_proto == IPPROTO_TCP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; -- cgit v1.2.3 From 37a6c1512314d2439ef7136d773d5a470e0996b9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:24 +0300 Subject: net/sched: cls_flower: Specify vlan attributes format in the UAPI header Specify the format (size and endianess) for the vlan attributes. Signed-off-by: Or Gerlitz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 60ea2a084880..8915b61bbf83 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -428,9 +428,9 @@ enum { TCA_FLOWER_KEY_UDP_DST, /* be16 */ TCA_FLOWER_FLAGS, - TCA_FLOWER_KEY_VLAN_ID, - TCA_FLOWER_KEY_VLAN_PRIO, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ -- cgit v1.2.3 From cafdc45c949b9963cbfb8fe3a68d0ab16b0208ce Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 15 Sep 2016 16:26:40 +0200 Subject: net-next: dsa: add Qualcomm tag RX/TX handler Add support for the 2-bytes Qualcomm tag that gigabit switches such as the QCA8337/N might insert when receiving packets, or that we need to insert while targeting specific switch ports. The tag is inserted directly behind the ethernet header. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: John Crispin Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/Kconfig | 3 ++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 ++ net/dsa/dsa_priv.h | 2 + net/dsa/tag_qca.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+) create mode 100644 net/dsa/tag_qca.c (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9d97c5214341..7556646db2d3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -26,6 +26,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_TRAILER, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_QCA, DSA_TAG_LAST, /* MUST BE LAST */ }; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index ff7736f7ff42..96e47c539bee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA config NET_DSA_TAG_TRAILER bool +config NET_DSA_TAG_QCA + bool + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 8af4ded70f1c..a3380ed0e0be 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d8d267e9a872..66e31acfcad8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -53,6 +53,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #endif #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 00077a9c97f4..6cfd7388834e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops; /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_qca.c */ +extern const struct dsa_device_ops qca_netdev_ops; #endif diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c new file mode 100644 index 000000000000..0c90cacee7aa --- /dev/null +++ b/net/dsa/tag_qca.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include "dsa_priv.h" + +#define QCA_HDR_LEN 2 +#define QCA_HDR_VERSION 0x2 + +#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_RECV_VERSION_S 14 +#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_RECV_PRIORITY_S 11 +#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6) +#define QCA_HDR_RECV_TYPE_S 6 +#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) +#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) + +#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_XMIT_VERSION_S 14 +#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_XMIT_PRIORITY_S 11 +#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8) +#define QCA_HDR_XMIT_CONTROL_S 8 +#define QCA_HDR_XMIT_FROM_CPU BIT(7) +#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0) + +static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u16 *phdr, hdr; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, 0) < 0) + goto out_free; + + skb_push(skb, QCA_HDR_LEN); + + memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); + phdr = (u16 *)(skb->data + 2 * ETH_ALEN); + + /* Set the version field, and set destination port information */ + hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | + QCA_HDR_XMIT_FROM_CPU | + BIT(p->port); + + *phdr = htons(hdr); + + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 ver; + int port; + __be16 *phdr, hdr; + + if (unlikely(!dst)) + goto out_drop; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) + goto out_drop; + + /* The QCA header is added by the switch between src addr and Ethertype + * At this point, skb->data points to ethertype so header should be + * right before + */ + phdr = (__be16 *)(skb->data - 2); + hdr = ntohs(*phdr); + + /* Make sure the version is correct */ + ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S; + if (unlikely(ver != QCA_HDR_VERSION)) + goto out_drop; + + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, + ETH_HLEN - QCA_HDR_LEN); + + /* This protocol doesn't support cascading multiple switches so it's + * safe to assume the switch is first in the tree + */ + ds = dst->ds[0]; + if (!ds) + goto out_drop; + + /* Get source port information */ + port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); + if (!ds->ports[port].netdev) + goto out_drop; + + /* Update skb & forward the frame accordingly */ + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[port].netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops qca_netdev_ops = { + .xmit = qca_tag_xmit, + .rcv = qca_tag_rcv, +}; -- cgit v1.2.3 From fbd05e4a6e82fd573d3aa79e284e424b8d78c149 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Thu, 15 Sep 2016 18:15:09 +0300 Subject: cfg80211: add helper to find an IE that matches a byte-array There are a few places where an IE that matches not only the EID, but also other bytes inside the element, needs to be found. To simplify that and reduce the amount of similar code, implement a new helper function to match the EID and an extra array of bytes. Additionally, simplify cfg80211_find_vendor_ie() by using the new match function. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 33 +++++++++++++++++++++++++++- net/wireless/scan.c | 58 +++++++++++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d5e7f690bad9..533cb6410678 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3946,6 +3946,34 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map); +/** + * cfg80211_find_ie_match - match information element and byte array in data + * + * @eid: element ID + * @ies: data consisting of IEs + * @len: length of data + * @match: byte array to match + * @match_len: number of bytes in the match array + * @match_offset: offset in the IE where the byte array should match. + * If match_len is zero, this must also be set to zero. + * Otherwise this must be set to 2 or more, because the first + * byte is the element id, which is already compared to eid, and + * the second byte is the IE length. + * + * Return: %NULL if the element ID could not be found or if + * the element is invalid (claims to be longer than the given + * data) or if the byte array doesn't match, or a pointer to the first + * byte of the requested element, that is the byte containing the + * element ID. + * + * Note: There are no checks on the element length other than + * having to fit into the given data and being large enough for the + * byte array to match. + */ +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset); + /** * cfg80211_find_ie - find information element in data * @@ -3961,7 +3989,10 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, * Note: There are no checks on the element length other than * having to fit into the given data. */ -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len); +static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +{ + return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0); +} /** * cfg80211_find_vendor_ie - find vendor specific information element in data diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0358e12be54b..b5bd58d0f731 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset) { - while (len > 2 && ies[0] != eid) { + /* match_offset can't be smaller than 2, unless match_len is + * zero, in which case match_offset must be zero as well. + */ + if (WARN_ON((match_len && match_offset < 2) || + (!match_len && match_offset))) + return NULL; + + while (len >= 2 && len >= ies[1] + 2) { + if ((ies[0] == eid) && + (ies[1] + 2 >= match_offset + match_len) && + !memcmp(ies + match_offset, match, match_len)) + return ies; + len -= ies[1] + 2; ies += ies[1] + 2; } - if (len < 2) - return NULL; - if (len < 2 + ies[1]) - return NULL; - return ies; + + return NULL; } -EXPORT_SYMBOL(cfg80211_find_ie); +EXPORT_SYMBOL(cfg80211_find_ie_match); const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, int len) { - struct ieee80211_vendor_ie *ie; - const u8 *pos = ies, *end = ies + len; - int ie_oui; + const u8 *ie; + u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; + int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; - while (pos < end) { - pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, - end - pos); - if (!pos) - return NULL; - - ie = (struct ieee80211_vendor_ie *)pos; - - /* make sure we can access ie->len */ - BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1); + ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, + match, match_len, 2); - if (ie->len < sizeof(*ie)) - goto cont; + if (ie && (ie[1] < 4)) + return NULL; - ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; - if (ie_oui == oui && - (oui_type < 0 || ie->oui_type == oui_type)) - return pos; -cont: - pos += 2 + ie->len; - } - return NULL; + return ie; } EXPORT_SYMBOL(cfg80211_find_vendor_ie); -- cgit v1.2.3 From a3868bfc8d5b0f36c784deab644ee1d2b0e6974b Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Print the packet type name in the Rx packet trace Print a symbolic packet type name for each valid received packet in the trace output, not just a number. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 5 +++-- net/rxrpc/ar-internal.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ea3b10ed91a8..0a30c673509c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -93,11 +93,12 @@ TRACE_EVENT(rxrpc_rx_packet, memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), - TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, - __entry->hdr.type, __entry->hdr.flags) + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") ); TRACE_EVENT(rxrpc_rx_done, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e78c40b37db5..0f6fafa2c271 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -551,6 +551,9 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +extern const char *const rxrpc_pkts[]; +extern const char *rxrpc_acks(u8 reason); + #include /* @@ -851,11 +854,8 @@ extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; extern unsigned int rxrpc_resend_timeout; -extern const char *const rxrpc_pkts[]; extern const s8 rxrpc_ack_priority[]; -extern const char *rxrpc_acks(u8 reason); - /* * output.c */ -- cgit v1.2.3 From 363deeab6d0f308d33d011323661ae9cf5f9f8d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add connection tracepoint and client conn state tracepoint Add a pair of tracepoints, one to track rxrpc_connection struct ref counting and the other to track the client connection cache state. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 60 ++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 76 +++++++++++++++++++++++++--------------- net/rxrpc/call_accept.c | 4 +++ net/rxrpc/call_object.c | 2 -- net/rxrpc/conn_client.c | 82 ++++++++++++++++++++++++++++++-------------- net/rxrpc/conn_event.c | 2 +- net/rxrpc/conn_object.c | 72 ++++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/misc.c | 31 +++++++++++++++++ 9 files changed, 274 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0a30c673509c..c0c496c83f31 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,66 @@ #include +TRACE_EVENT(rxrpc_conn, + TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, + int usage, const void *where), + + TP_ARGS(conn, op, usage, where), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->op = op; + __entry->usage = usage; + __entry->where = where; + ), + + TP_printk("C=%p %s u=%d sp=%pSR", + __entry->conn, + rxrpc_conn_traces[__entry->op], + __entry->usage, + __entry->where) + ); + +TRACE_EVENT(rxrpc_client, + TP_PROTO(struct rxrpc_connection *conn, int channel, + enum rxrpc_client_trace op), + + TP_ARGS(conn, channel, op), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(u32, cid ) + __field(int, channel ) + __field(int, usage ) + __field(enum rxrpc_client_trace, op ) + __field(enum rxrpc_conn_cache_state, cs ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->channel = channel; + __entry->usage = atomic_read(&conn->usage); + __entry->op = op; + __entry->cid = conn->proto.cid; + __entry->cs = conn->cache_state; + ), + + TP_printk("C=%p h=%2d %s %s i=%08x u=%d", + __entry->conn, + __entry->channel, + rxrpc_client_traces[__entry->op], + rxrpc_conn_cache_states[__entry->cs], + __entry->cid, + __entry->usage) + ); + TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, int usage, const void *where, const void *aux), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4a73c20d9436..6ca40eea3022 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -314,6 +314,7 @@ enum rxrpc_conn_cache_state { RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ + RXRPC_CONN__NR_CACHE_STATES }; /* @@ -533,6 +534,44 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_conn_trace { + rxrpc_conn_new_client, + rxrpc_conn_new_service, + rxrpc_conn_queued, + rxrpc_conn_seen, + rxrpc_conn_got, + rxrpc_conn_put_client, + rxrpc_conn_put_service, + rxrpc_conn__nr_trace +}; + +extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; + +enum rxrpc_client_trace { + rxrpc_client_activate_chans, + rxrpc_client_alloc, + rxrpc_client_chan_activate, + rxrpc_client_chan_disconnect, + rxrpc_client_chan_pass, + rxrpc_client_chan_unstarted, + rxrpc_client_cleanup, + rxrpc_client_count, + rxrpc_client_discard, + rxrpc_client_duplicate, + rxrpc_client_exposed, + rxrpc_client_replace, + rxrpc_client_to_active, + rxrpc_client_to_culled, + rxrpc_client_to_idle, + rxrpc_client_to_inactive, + rxrpc_client_to_waiting, + rxrpc_client_uncount, + rxrpc_client__nr_trace +}; + +extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; +extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; + enum rxrpc_call_trace { rxrpc_call_new_client, rxrpc_call_new_service, @@ -734,7 +773,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); -void __rxrpc_put_connection(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *); +void rxrpc_see_connection(struct rxrpc_connection *); +void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); +void rxrpc_put_service_conn(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -747,38 +790,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_get_connection(struct rxrpc_connection *conn) -{ - atomic_inc(&conn->usage); -} - -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) -{ - return atomic_inc_not_zero(&conn->usage) ? conn : NULL; -} - static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { if (!conn) return; - if (rxrpc_conn_is_client(conn)) { - if (atomic_dec_and_test(&conn->usage)) - rxrpc_put_client_conn(conn); - } else { - if (atomic_dec_return(&conn->usage) == 1) - __rxrpc_put_connection(conn); - } -} - -static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) -{ - if (!rxrpc_get_connection_maybe(conn)) - return false; - if (!rxrpc_queue_work(&conn->processor)) - rxrpc_put_connection(conn); - return true; + if (rxrpc_conn_is_client(conn)) + rxrpc_put_client_conn(conn); + else + rxrpc_put_service_conn(conn); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 323b8da50163..3e474508ba75 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -85,6 +85,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), here); } /* Now it gets complicated, because calls get registered with the @@ -290,6 +293,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_get_local(local); conn->params.local = local; conn->params.peer = peer; + rxrpc_see_connection(conn); rxrpc_new_incoming_connection(conn, skb); } else { rxrpc_get_connection(conn); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0df9d1af8edb..54f30482a7fd 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -479,8 +479,6 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 226bc910e556..c76a125df891 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,6 +105,14 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); +const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { + [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", + [RXRPC_CONN_CLIENT_WAITING] = "Wait", + [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", + [RXRPC_CONN_CLIENT_CULLED] = "Cull", + [RXRPC_CONN_CLIENT_IDLE] = "Idle", +}; + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); + trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + __builtin_return_address(0)); + trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -385,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, rb_replace_node(&conn->client_node, &candidate->client_node, &local->client_conns); + trace_rxrpc_client(conn, -1, rxrpc_client_replace); goto candidate_published; } } @@ -409,8 +421,11 @@ found_extant_conn: _debug("found conn"); spin_unlock(&local->client_conns_lock); - rxrpc_put_connection(candidate); - candidate = NULL; + if (candidate) { + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + candidate = NULL; + } spin_lock(&conn->channel_lock); call->conn = conn; @@ -433,6 +448,7 @@ error: */ static void rxrpc_activate_conn(struct rxrpc_connection *conn) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; rxrpc_nr_active_client_conns++; list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); @@ -462,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = rxrpc_nr_client_conns; - if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_count); rxrpc_nr_client_conns = nr_conns + 1; + } switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: @@ -494,6 +512,7 @@ activate_conn: wait_for_capacity: _debug("wait"); + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); goto out_unlock; @@ -524,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; write_unlock_bh(&call->state_lock); @@ -563,6 +584,8 @@ static void rxrpc_activate_channels(struct rxrpc_connection *conn) _enter("%d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) return; @@ -657,10 +680,13 @@ int rxrpc_connect_call(struct rxrpc_call *call, * had a chance at re-use (the per-connection security negotiation is * expensive). */ -static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, + unsigned int channel) { - if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); rxrpc_get_connection(conn); + } } /* @@ -669,9 +695,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) */ void rxrpc_expose_client_call(struct rxrpc_call *call) { + unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = - &conn->channels[call->cid & RXRPC_CHANNELMASK]; + struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter @@ -682,7 +708,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_expose_client_conn(conn); + rxrpc_expose_client_conn(conn, channel); } } @@ -695,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; spin_lock(&conn->channel_lock); @@ -709,6 +736,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->chan_wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); + /* We must deactivate or idle the connection if it's now * waiting for nothing. */ @@ -739,7 +768,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) /* See if we can pass the channel directly to another call. */ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && !list_empty(&conn->waiting_calls)) { - _debug("pass chan"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out_2; } @@ -762,7 +791,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out; } - _debug("pass chan 2"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out; @@ -794,7 +823,7 @@ idle_connection: * immediately or moved to the idle list for a short while. */ if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - _debug("make idle"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; conn->cache_state = RXRPC_CONN_CLIENT_IDLE; list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); @@ -804,7 +833,7 @@ idle_connection: &rxrpc_client_conn_reap, rxrpc_conn_idle_client_expiry); } else { - _debug("make inactive"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); } @@ -821,6 +850,8 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; + trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { spin_lock(&local->client_conns_lock); if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, @@ -834,6 +865,7 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_uncount); spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = --rxrpc_nr_client_conns; @@ -863,20 +895,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) */ void rxrpc_put_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + const void *here = __builtin_return_address(0); + int n; do { - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); - - next = rxrpc_put_one_client_conn(conn); - - if (!next) - break; - conn = next; - } while (atomic_dec_and_test(&conn->usage)); - - _leave(""); + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + if (n > 0) + return; + ASSERTCMP(n, >=, 0); + + conn = rxrpc_put_one_client_conn(conn); + } while (conn); } /* @@ -907,9 +937,11 @@ static void rxrpc_cull_active_client_conns(void) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); if (list_empty(&conn->waiting_calls)) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); conn->cache_state = RXRPC_CONN_CLIENT_CULLED; list_del_init(&conn->cache_link); } else { + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); @@ -983,7 +1015,7 @@ next: goto not_yet_expired; } - _debug("discard conn %d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) BUG(); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 0691007cfc02..a43f4c94a88d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -377,7 +377,7 @@ void rxrpc_process_connection(struct work_struct *work) u32 abort_code = RX_PROTOCOL_ERROR; int ret; - _enter("{%d}", conn->debug_id); + rxrpc_see_connection(conn); if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bb1f29280aea..3b55aee0c436 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -246,11 +246,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) } /* - * release a virtual connection + * Queue a connection's work processor, getting a ref to pass to the work + * queue. */ -void __rxrpc_put_connection(struct rxrpc_connection *conn) +bool rxrpc_queue_conn(struct rxrpc_connection *conn) { - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n == 0) + return false; + if (rxrpc_queue_work(&conn->processor)) + trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + else + rxrpc_put_connection(conn); + return true; +} + +/* + * Note the re-emergence of a connection. + */ +void rxrpc_see_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + if (conn) { + int n = atomic_read(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + } +} + +/* + * Get a ref on a connection. + */ +void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); +} + +/* + * Try to get a ref on a connection. + */ +struct rxrpc_connection * +rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + + if (conn) { + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n > 0) + trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + else + conn = NULL; + } + return conn; +} + +/* + * Release a service connection + */ +void rxrpc_put_service_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + ASSERTCMP(n, >=, 0); + if (n == 0) + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 83d54da4ce8b..eef551f40dc2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -136,6 +136,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) list_add_tail(&conn->link, &rxrpc_connections); list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), + __builtin_return_address(0)); } return conn; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 8b910780f1ac..598064d3bdd2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -101,3 +101,34 @@ const char *rxrpc_acks(u8 reason) reason = ARRAY_SIZE(str) - 1; return str[reason]; } + +const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { + [rxrpc_conn_new_client] = "NWc", + [rxrpc_conn_new_service] = "NWs", + [rxrpc_conn_queued] = "QUE", + [rxrpc_conn_seen] = "SEE", + [rxrpc_conn_got] = "GOT", + [rxrpc_conn_put_client] = "PTc", + [rxrpc_conn_put_service] = "PTs", +}; + +const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { + [rxrpc_client_activate_chans] = "Activa", + [rxrpc_client_alloc] = "Alloc ", + [rxrpc_client_chan_activate] = "ChActv", + [rxrpc_client_chan_disconnect] = "ChDisc", + [rxrpc_client_chan_pass] = "ChPass", + [rxrpc_client_chan_unstarted] = "ChUnst", + [rxrpc_client_cleanup] = "Clean ", + [rxrpc_client_count] = "Count ", + [rxrpc_client_discard] = "Discar", + [rxrpc_client_duplicate] = "Duplic", + [rxrpc_client_exposed] = "Expose", + [rxrpc_client_replace] = "Replac", + [rxrpc_client_to_active] = "->Actv", + [rxrpc_client_to_culled] = "->Cull", + [rxrpc_client_to_idle] = "->Idle", + [rxrpc_client_to_inactive] = "->Inac", + [rxrpc_client_to_waiting] = "->Wait", + [rxrpc_client_uncount] = "Uncoun", +}; -- cgit v1.2.3 From a124fe3ee5d82f2c9a9b8818ed5cb9f61685f1d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow the life of a packet in the Tx buffer Add a tracepoint to follow the insertion of a packet into the transmit buffer, its transmission and its rotation out of the buffer. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/input.c | 2 ++ net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/sendmsg.c | 9 ++++++++- 5 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c0c496c83f31..ffc74b3e5b76 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -208,6 +208,32 @@ TRACE_EVENT(rxrpc_abort, __entry->abort_code, __entry->error, __entry->why) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), + + TP_ARGS(call, why), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_transmit_trace, why ) + __field(rxrpc_seq_t, tx_hard_ack ) + __field(rxrpc_seq_t, tx_top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->tx_hard_ack = call->tx_hard_ack; + __entry->tx_top = call->tx_top; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_transmit_traces[__entry->why], + __entry->tx_hard_ack + 1, + __entry->tx_top - __entry->tx_hard_ack) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6ca40eea3022..afa5dcc05fe0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -593,6 +593,18 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +enum rxrpc_transmit_trace { + rxrpc_transmit_wait, + rxrpc_transmit_queue, + rxrpc_transmit_queue_reqack, + rxrpc_transmit_queue_last, + rxrpc_transmit_rotate, + rxrpc_transmit_end, + rxrpc_transmit__nr_trace +}; + +extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c1f83d22f9b7..c7eb5104e91a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) spin_unlock(&call->lock); + trace_rxrpc_transmit(call, rxrpc_transmit_rotate); wake_up(&call->waitq); while (list) { @@ -107,6 +108,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) } write_unlock(&call->state_lock); + trace_rxrpc_transmit(call, rxrpc_transmit_end); _leave(" = ok"); return true; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 598064d3bdd2..dca89995f03e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -132,3 +132,12 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { [rxrpc_client_to_waiting] = "->Wait", [rxrpc_client_uncount] = "Uncoun", }; + +const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { + [rxrpc_transmit_wait] = "WAI", + [rxrpc_transmit_queue] = "QUE", + [rxrpc_transmit_queue_reqack] = "QRA", + [rxrpc_transmit_queue_last] = "QLS", + [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_end] = "END", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8bfddf4e338c..28d8f73cf11d 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -56,6 +56,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, break; } + trace_rxrpc_transmit(call, rxrpc_transmit_wait); release_sock(&rx->sk); *timeo = schedule_timeout(*timeo); lock_sock(&rx->sk); @@ -104,8 +105,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) + if (last) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); + trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_queue); + } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.3 From ec71eb9ada34f8d1a58b7c35d906c59411295445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log received ACK packets Add a tracepoint to log information from received ACK packets. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/input.c | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ffc74b3e5b76..2b19f3fa5174 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -234,6 +234,32 @@ TRACE_EVENT(rxrpc_transmit, __entry->tx_top - __entry->tx_hard_ack) ); +TRACE_EVENT(rxrpc_rx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, u8 reason, u8 n_acks), + + TP_ARGS(call, first, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c7eb5104e91a..7b18ca124978 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -440,6 +440,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, ntohs(buf.ack.maxSkew), -- cgit v1.2.3 From f3639df2d90bc919328c459b3c7c49ed5667a52f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log ACK transmission Add a tracepoint to log information about ACK transmission. Signed-off-by: David Howels --- include/trace/events/rxrpc.h | 30 ++++++++++++++++++++++++++++++ net/rxrpc/conn_event.c | 3 +++ net/rxrpc/output.c | 7 ++++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2b19f3fa5174..d545d692ae22 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -260,6 +260,36 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, + rxrpc_serial_t serial, u8 reason, u8 n_acks), + + TP_ARGS(call, first, serial, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(rxrpc_serial_t, serial ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->serial = serial; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x r=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->serial, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a43f4c94a88d..9b19c51831aa 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,6 +98,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); + + trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0b21ed859de7..2c9daeadce87 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -38,12 +38,14 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; /* Barrier against rxrpc_input_data(). */ + serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); @@ -51,7 +53,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.maxSkew = htons(call->ackr_skew); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); - pkt->ack.serial = htonl(call->ackr_serial); + pkt->ack.serial = htonl(serial); pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; @@ -75,6 +77,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); + trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, + top - hard_ack); + *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; -- cgit v1.2.3 From 58dc63c998ea3c5a27e2bf9251eddbf0977056a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow packets in the Rx buffer Add a tracepoint to follow the life of packets that get added to a call's receive buffer. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 33 +++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/call_accept.c | 3 +++ net/rxrpc/input.c | 6 +++++- net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/recvmsg.c | 11 +++++++++++ 6 files changed, 73 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d545d692ae22..7dd5f0188681 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -290,6 +290,39 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_receive, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why, + rxrpc_serial_t serial, rxrpc_seq_t seq), + + TP_ARGS(call, why, serial, seq), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_receive_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->seq = seq; + __entry->hard_ack = call->rx_hard_ack; + __entry->top = call->rx_top; + ), + + TP_printk("c=%p %s r=%08x q=%08x w=%08x-%08x", + __entry->call, + rxrpc_receive_traces[__entry->why], + __entry->serial, + __entry->seq, + __entry->hard_ack, + __entry->top) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index afa5dcc05fe0..e5d2f2fb8e41 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -605,6 +605,18 @@ enum rxrpc_transmit_trace { extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; +enum rxrpc_receive_trace { + rxrpc_receive_incoming, + rxrpc_receive_queue, + rxrpc_receive_queue_last, + rxrpc_receive_front, + rxrpc_receive_rotate, + rxrpc_receive_end, + rxrpc_receive__nr_trace +}; + +extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3e474508ba75..a8d39d7cf42c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -367,6 +367,9 @@ found_service: goto out; } + trace_rxrpc_receive(call, rxrpc_receive_incoming, + sp->hdr.serial, sp->hdr.seq); + /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7b18ca124978..b690220533c6 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -284,8 +284,12 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index dca89995f03e..db5f1d54fc90 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -141,3 +141,12 @@ const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_rotate] = "ROT", [rxrpc_transmit_end] = "END", }; + +const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { + [rxrpc_receive_incoming] = "INC", + [rxrpc_receive_queue] = "QUE", + [rxrpc_receive_queue_last] = "QLS", + [rxrpc_receive_front] = "FRN", + [rxrpc_receive_rotate] = "ROT", + [rxrpc_receive_end] = "END", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8b8d7e14f800..22d51087c580 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { @@ -167,6 +168,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; u8 flags; int ix; @@ -183,6 +185,10 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_see_skb(skb); sp = rxrpc_skb(skb); flags = sp->hdr.flags; + serial = sp->hdr.serial; + if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) + serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -191,6 +197,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); _debug("%u,%u,%02x", hard_ack, top, flags); + trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -309,6 +316,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_see_skb(skb); sp = rxrpc_skb(skb); + if (!(flags & MSG_PEEK)) + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + if (msg) sock_recv_timestamp(msg, sock->sk, skb); -- cgit v1.2.3 From 849979051cbc9352857d8bb31895ae55afe19d96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 11:13:31 +0100 Subject: rxrpc: Add a tracepoint to follow what recvmsg does Add a tracepoint to follow what recvmsg does within AF_RXRPC. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 34 ++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 17 +++++++++++++++++ net/rxrpc/misc.c | 14 ++++++++++++++ net/rxrpc/recvmsg.c | 34 ++++++++++++++++++++++++++-------- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 7dd5f0188681..58732202e9f0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -323,6 +323,40 @@ TRACE_EVENT(rxrpc_receive, __entry->top) ); +TRACE_EVENT(rxrpc_recvmsg, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + rxrpc_seq_t seq, unsigned int offset, unsigned int len, + int ret), + + TP_ARGS(call, why, seq, offset, len, ret), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_recvmsg_trace, why ) + __field(rxrpc_seq_t, seq ) + __field(unsigned int, offset ) + __field(unsigned int, len ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->seq = seq; + __entry->offset = offset; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("c=%p %s q=%08x o=%u l=%u ret=%d", + __entry->call, + rxrpc_recvmsg_traces[__entry->why], + __entry->seq, + __entry->offset, + __entry->len, + __entry->ret) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e5d2f2fb8e41..a17341d2df3d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -617,6 +617,23 @@ enum rxrpc_receive_trace { extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; +enum rxrpc_recvmsg_trace { + rxrpc_recvmsg_enter, + rxrpc_recvmsg_wait, + rxrpc_recvmsg_dequeue, + rxrpc_recvmsg_hole, + rxrpc_recvmsg_next, + rxrpc_recvmsg_cont, + rxrpc_recvmsg_full, + rxrpc_recvmsg_data_return, + rxrpc_recvmsg_terminal, + rxrpc_recvmsg_to_be_accepted, + rxrpc_recvmsg_return, + rxrpc_recvmsg__nr_trace +}; + +extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index db5f1d54fc90..c7065d893d1e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -150,3 +150,17 @@ const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { [rxrpc_receive_rotate] = "ROT", [rxrpc_receive_end] = "END", }; + +const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { + [rxrpc_recvmsg_enter] = "ENTR", + [rxrpc_recvmsg_wait] = "WAIT", + [rxrpc_recvmsg_dequeue] = "DEQU", + [rxrpc_recvmsg_hole] = "HOLE", + [rxrpc_recvmsg_next] = "NEXT", + [rxrpc_recvmsg_cont] = "CONT", + [rxrpc_recvmsg_full] = "FULL", + [rxrpc_recvmsg_data_return] = "DATA", + [rxrpc_recvmsg_terminal] = "TERM", + [rxrpc_recvmsg_to_be_accepted] = "TBAC", + [rxrpc_recvmsg_return] = "RETN", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 22d51087c580..b62a08151895 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -94,6 +94,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -124,6 +126,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); return ret; } @@ -310,8 +313,11 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, for (seq = hard_ack + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - if (!skb) + if (!skb) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); break; + } smp_rmb(); rxrpc_see_skb(skb); sp = rxrpc_skb(skb); @@ -327,10 +333,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } + } else { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -357,6 +368,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } if (rx_pkt_len > 0) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; @@ -383,6 +396,8 @@ out: call->rx_pkt_len = rx_pkt_len; } done: + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); _leave(" = %d [%u/%u]", ret, seq, top); return ret; } @@ -404,7 +419,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, DEFINE_WAIT(wait); - _enter(",,,%zu,%d", len, flags); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -424,8 +439,10 @@ try_again: if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; - if (timeo == 0) + if (timeo == 0) { + call = NULL; goto error_no_call; + } release_sock(&rx->sk); @@ -439,6 +456,8 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, + 0, 0, 0, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -457,7 +476,7 @@ try_again: rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); - _debug("recvmsg call %p", call); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -527,16 +546,15 @@ error: rxrpc_put_call(call, rxrpc_call_put); error_no_call: release_sock(&rx->sk); - _leave(" = %d", ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - release_sock(&rx->sk); - _leave(" = %d [wait]", ret); - return ret; + call = NULL; + goto error_no_call; } /** -- cgit v1.2.3 From 71f3ca408fd43b586c02480768a503af075b247e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Improve skb tracing Improve sk_buff tracing within AF_RXRPC by the following means: (1) Use an enum to note the event type rather than plain integers and use an array of event names rather than a big multi ?: list. (2) Distinguish Rx from Tx packets and account them separately. This requires the call phase to be tracked so that we know what we might find in rxtx_buffer[]. (3) Add a parameter to rxrpc_{new,see,get,free}_skb() to indicate the event type. (4) A pair of 'rotate' events are added to indicate packets that are about to be rotated out of the Rx and Tx windows. (5) A pair of 'lost' events are added, along with rxrpc_lose_skb() for packet loss injection recording. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 12 ++++------ net/rxrpc/af_rxrpc.c | 5 +++-- net/rxrpc/ar-internal.h | 33 ++++++++++++++++++++++----- net/rxrpc/call_event.c | 8 +++---- net/rxrpc/call_object.c | 11 ++++++--- net/rxrpc/conn_event.c | 6 ++--- net/rxrpc/input.c | 13 ++++++----- net/rxrpc/local_event.c | 4 ++-- net/rxrpc/misc.c | 18 +++++++++++++++ net/rxrpc/output.c | 4 ++-- net/rxrpc/peer_event.c | 10 ++++----- net/rxrpc/recvmsg.c | 7 +++--- net/rxrpc/sendmsg.c | 10 ++++----- net/rxrpc/skbuff.c | 53 +++++++++++++++++++++++++++++++------------- 14 files changed, 131 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 58732202e9f0..75a5d8bf50e1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -107,14 +107,14 @@ TRACE_EVENT(rxrpc_call, ); TRACE_EVENT(rxrpc_skb, - TP_PROTO(struct sk_buff *skb, int op, int usage, int mod_count, - const void *where), + TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, + int usage, int mod_count, const void *where), TP_ARGS(skb, op, usage, mod_count, where), TP_STRUCT__entry( __field(struct sk_buff *, skb ) - __field(int, op ) + __field(enum rxrpc_skb_trace, op ) __field(int, usage ) __field(int, mod_count ) __field(const void *, where ) @@ -130,11 +130,7 @@ TRACE_EVENT(rxrpc_skb, TP_printk("s=%p %s u=%d m=%d p=%pSR", __entry->skb, - (__entry->op == 0 ? "NEW" : - __entry->op == 1 ? "SEE" : - __entry->op == 2 ? "GET" : - __entry->op == 3 ? "FRE" : - "PUR"), + rxrpc_skb_traces[__entry->op], __entry->usage, __entry->mod_count, __entry->where) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 09f81befc705..8dbf7bed2cc4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -45,7 +45,7 @@ u32 rxrpc_epoch; atomic_t rxrpc_debug_id; /* count of skbs currently in use */ -atomic_t rxrpc_n_skbs; +atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -867,7 +867,8 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a17341d2df3d..034f525f2235 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -520,6 +520,7 @@ struct rxrpc_call { rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ + bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ @@ -534,6 +535,27 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_skb_trace { + rxrpc_skb_rx_cleaned, + rxrpc_skb_rx_freed, + rxrpc_skb_rx_got, + rxrpc_skb_rx_lost, + rxrpc_skb_rx_received, + rxrpc_skb_rx_rotated, + rxrpc_skb_rx_purged, + rxrpc_skb_rx_seen, + rxrpc_skb_tx_cleaned, + rxrpc_skb_tx_freed, + rxrpc_skb_tx_got, + rxrpc_skb_tx_lost, + rxrpc_skb_tx_new, + rxrpc_skb_tx_rotated, + rxrpc_skb_tx_seen, + rxrpc_skb__nr_trace +}; + +extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; + enum rxrpc_conn_trace { rxrpc_conn_new_client, rxrpc_conn_new_service, @@ -642,7 +664,7 @@ extern const char *rxrpc_acks(u8 reason); /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_skbs; +extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -1000,10 +1022,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); -void rxrpc_new_skb(struct sk_buff *); -void rxrpc_see_skb(struct sk_buff *); -void rxrpc_get_skb(struct sk_buff *); -void rxrpc_free_skb(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f0cabc48a1b7..7d1b99824ed9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -170,7 +170,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); if (annotation == RXRPC_TX_ANNO_UNACK) { @@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); sp = rxrpc_skb(skb); @@ -211,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_send_data_packet(call->conn, skb) < 0) { call->resend_at = now + 2; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } @@ -219,7 +219,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_expose_client_call(call); sp->resend_at = now + rxrpc_resend_timeout; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); /* We need to clear the retransmit state, but there are two diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 54f30482a7fd..f50a6094e198 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -182,6 +182,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; + call->tx_phase = true; _leave(" = %p", call); return call; @@ -458,7 +459,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); call->rxtx_buffer[i] = NULL; } @@ -552,9 +555,11 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) /* Clean up the Rx/Tx buffer */ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9b19c51831aa..75a15a4c74c3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -388,7 +388,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -399,7 +399,7 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); break; } } @@ -416,7 +416,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [EPROTO]"); goto out; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b690220533c6..84bb16d47b85 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -50,7 +50,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; @@ -66,7 +66,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) skb = list; list = skb->next; skb->next = NULL; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } } @@ -99,6 +99,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) default: break; case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->tx_phase = false; call->state = RXRPC_CALL_CLIENT_RECV_REPLY; break; case RXRPC_CALL_SERVER_AWAIT_ACK: @@ -278,7 +279,7 @@ next_subpacket: * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() * and also rxrpc_fill_out_ack(). */ - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_rx_got); call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -691,13 +692,13 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -821,7 +822,7 @@ void rxrpc_data_ready(struct sock *udp_sk) discard_unlock: rcu_read_unlock(); discard: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: trace_rxrpc_rx_done(0, 0); return; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index f073e932500e..190f68bd9e27 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -107,7 +107,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index c7065d893d1e..026e1f2e83ff 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -102,6 +102,24 @@ const char *rxrpc_acks(u8 reason) return str[reason]; } +const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { + [rxrpc_skb_rx_cleaned] = "Rx CLN", + [rxrpc_skb_rx_freed] = "Rx FRE", + [rxrpc_skb_rx_got] = "Rx GOT", + [rxrpc_skb_rx_lost] = "Rx *L*", + [rxrpc_skb_rx_received] = "Rx RCV", + [rxrpc_skb_rx_purged] = "Rx PUR", + [rxrpc_skb_rx_rotated] = "Rx ROT", + [rxrpc_skb_rx_seen] = "Rx SEE", + [rxrpc_skb_tx_cleaned] = "Tx CLN", + [rxrpc_skb_tx_freed] = "Tx FRE", + [rxrpc_skb_tx_got] = "Tx GOT", + [rxrpc_skb_tx_lost] = "Tx *L*", + [rxrpc_skb_tx_new] = "Tx NEW", + [rxrpc_skb_tx_rotated] = "Tx ROT", + [rxrpc_skb_tx_seen] = "Tx SEE", +}; + const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { [rxrpc_conn_new_client] = "NWc", [rxrpc_conn_new_service] = "NWs", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2c9daeadce87..a2cad5ce7416 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -324,7 +324,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { @@ -343,7 +343,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) kernel_sendmsg(local->socket, &msg, iov, 2, size); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 9e0725f5652b..18276e7cb9e0 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -155,11 +155,11 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); return; } @@ -169,7 +169,7 @@ void rxrpc_error_report(struct sock *sk) peer = NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [no peer]"); return; } @@ -179,7 +179,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -187,7 +187,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ rxrpc_queue_work(&peer->error_distributor); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 79e65668bc58..6ba4af5a8d95 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -155,6 +155,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) break; case RXRPC_CALL_SERVER_RECV_REQUEST: + call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; break; default: @@ -185,7 +186,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); sp = rxrpc_skb(skb); flags = sp->hdr.flags; serial = sp->hdr.serial; @@ -197,7 +198,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); @@ -317,7 +318,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } smp_rmb(); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (!(flags & MSG_PEEK)) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 28d8f73cf11d..6a39ee97a0b7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,7 +100,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -146,7 +146,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_instant_resend(call, ix); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(""); } @@ -201,7 +201,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); copied = 0; do { @@ -242,7 +242,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!skb) goto maybe_error; - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_tx_new); _debug("ALLOC SEND %p", skb); @@ -352,7 +352,7 @@ out: return ret; call_terminated: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(" = %d", -call->error); return -call->error; diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 620d9ccaf3c1..5154cbf7e540 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,54 +18,76 @@ #include #include "ar-internal.h" +#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) + /* - * Note the existence of a new-to-us socket buffer (allocated or dequeued). + * Note the allocation or reception of a socket buffer. */ -void rxrpc_new_skb(struct sk_buff *skb) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ -void rxrpc_see_skb(struct sk_buff *skb) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { - int n = atomic_read(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + int n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } } /* * Note the addition of a ref on a socket buffer. */ -void rxrpc_get_skb(struct sk_buff *skb) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); skb_get(skb); } /* * Note the destruction of a socket buffer. */ -void rxrpc_free_skb(struct sk_buff *skb) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); kfree_skb(skb); } } +/* + * Note the injected loss of a socket buffer. + */ +void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + if (op == rxrpc_skb_tx_lost) { + n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + } else { + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } + } +} + /* * Clear a queue of socket buffers. */ @@ -74,8 +96,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list) const void *here = __builtin_return_address(0); struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { - int n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); + trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, + atomic_read(&skb->users), n, here); kfree_skb(skb); } } -- cgit v1.2.3 From 19664c6a000956290cce84c6924b13488ab794d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 15 Sep 2016 10:18:45 -0700 Subject: net: l3mdev: Remove netif_index_is_l3_master No longer used after e0d56fdd73422 ("net: l3mdev: remove redundant calls") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 3832099289c5..b220dabeab45 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -114,25 +114,6 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev) return tb_id; } -static inline bool netif_index_is_l3_master(struct net *net, int ifindex) -{ - struct net_device *dev; - bool rc = false; - - if (ifindex == 0) - return false; - - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev) - rc = netif_is_l3_master(dev); - - rcu_read_unlock(); - - return rc; -} - struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline @@ -226,11 +207,6 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return 0; } -static inline bool netif_index_is_l3_master(struct net *net, int ifindex) -{ - return false; -} - static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { -- cgit v1.2.3 From cfc7381b3002756b1dcada32979e942aa3126e31 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:29 -0700 Subject: ip_tunnel: add collect_md mode to IPIP tunnel Similar to gre, vxlan, geneve tunnels allow IPIP tunnels to operate in 'collect metadata' mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. ovs can use it as well in the future (once appropriate ovs-vport abstractions and user apis are added). Note that just like in other tunnels we cannot cache the dst, since tunnel_info metadata can be different for every packet. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 76 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ipip.c | 35 +++++++++++++++---- 4 files changed, 108 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e598c639aa6f..59557c07904b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const u8 proto); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9865c8caedde..18d5dc13985d 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -73,6 +73,7 @@ enum { IFLA_IPTUN_ENCAP_FLAGS, IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, + IFLA_IPTUN_COLLECT_METADATA, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 95649ebd2874..5719d6ba0824 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + u32 headroom = sizeof(struct iphdr); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + const struct iphdr *inner_iph; + struct rtable *rt; + struct flowi4 fl4; + __be16 df = 0; + u8 tos, ttl; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto tx_error; + key = &tun_info->key; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + tos = key->tos; + if (tos == 1) { + if (skb->protocol == htons(ETH_P_IP)) + tos = inner_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + } + init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link); + if (tunnel->encap.type != TUNNEL_ENCAP_NONE) + goto tx_error; + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (rt->dst.dev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); + ttl = key->ttl; + if (ttl == 0) { + if (skb->protocol == htons(ETH_P_IP)) + ttl = inner_iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; + else + ttl = ip4_dst_hoplimit(&rt->dst); + } + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + else if (skb->protocol == htons(ETH_P_IP)) + df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + if (headroom > dev->needed_headroom) + dev->needed_headroom = headroom; + + if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); + goto tx_dropped; + } + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, + key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + return; +tx_error: + dev->stats.tx_errors++; + goto kfree; +tx_dropped: + dev->stats.tx_dropped++; +kfree: + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); + void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4ae3f8e6c6cc..c9392589c415 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -115,6 +115,7 @@ #include #include #include +#include static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); @@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->collect_md) { + tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; @@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, ipproto); + if (tunnel->collect_md) + ip_md_tunnel_xmit(skb, dev, ipproto); + else + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: @@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, bool *collect_md) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; + *collect_md = false; if (!data) return; @@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + *collect_md = true; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &t->collect_md); return ip_tunnel_newlink(dev, tb, &p); } @@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + bool collect_md; if (ipip_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); @@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &collect_md); + if (collect_md) + return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) tunnel->encap.flags)) goto nla_put_failure; + if (tunnel->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { -- cgit v1.2.3 From 8d79266bc48c6ab6477d04e159cabf1e7809cb72 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:30 -0700 Subject: ip6_tunnel: add collect_md mode to IPv6 tunnels Similar to gre, vxlan, geneve tunnels allow IPIP6 and IP6IP6 tunnels to operate in 'collect metadata' mode. Unlike ipv4 code here it's possible to reuse ip6_tnl_xmit() function for both collect_md and traditional tunnels. bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 1 + net/ipv6/ip6_tunnel.c | 178 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 134 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 43a5a0e4524c..20ed9699fcd4 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -23,6 +23,7 @@ struct __ip6_tnl_parm { __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ + bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5c5779720ef1..6a66adba0c22 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ #include #include #include +#include MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -90,6 +91,7 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; + struct ip6_tnl __rcu *collect_md_tun; }; static struct net_device_stats *ip6_get_stats(struct net_device *dev) @@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ return t; } + t = rcu_dereference(ip6n->collect_md_tun); + if (t) + return t; + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; @@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, NULL); + for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); @@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + if (t->parms.collect_md) { + tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } @@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, int mtu; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; + u8 hop_limit; int err = -1; + if (t->parms.collect_md) { + hop_limit = skb_tunnel_info(skb)->key.ttl; + goto route_lookup; + } else { + hop_limit = t->parms.hop_limit; + } + /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { struct in6_addr *addr6; @@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, goto tx_err_link_failure; if (!dst) { +route_lookup: dst = ip6_route_output(net, NULL, fl6); if (dst->error) @@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, dst = NULL; goto tx_err_link_failure; } + if (t->parms.collect_md && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6->daddr, 0, &fl6->saddr)) + goto tx_err_link_failure; ndst = dst; } @@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; @@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, skb = new_skb; } - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + if (t->parms.collect_md) { + if (t->encap.type != TUNNEL_ENCAP_NONE) + goto tx_err_dst_release; + } else { + if (!fl6->flowi6_mark && ndst) + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + } skb_dst_set(skb, dst); if (encap_limit >= 0) { @@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; @@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; + dsfield = ipv4_get_dsfield(iph); - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; - dsfield = ipv4_get_dsfield(iph); + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + dsfield = ipv6_get_dsfield(ipv6h); + + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); + if (t->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } @@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + parms->collect_md = true; } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *nt, *t; struct ip_tunnel_encap ipencap; @@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, ip6_tnl_netlink_parms(data, &nt->parms); - t = ip6_tnl_locate(net, &nt->parms, 0); - if (!IS_ERR(t)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ip6n->collect_md_tun)) + return -EEXIST; + } else { + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) + return -EEXIST; + } return ip6_tnl_create2(dev); } @@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], return err; } ip6_tnl_netlink_parms(data, &p); + if (p.collect_md) + return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { @@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, - tunnel->encap.type) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, - tunnel->encap.sport) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, - tunnel->encap.dport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.flags)) + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; + if (parm->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { -- cgit v1.2.3 From b61c654f9b3f1a271217e46c893f80565b1f754d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:20 +0800 Subject: sctp: free msg->chunks when sctp_primitive_SEND return err Last patch "sctp: do not return the transmit err back to sctp_sendmsg" made sctp_primitive_SEND return err only when asoc state is unavailable. In this case, chunks are not enqueued, they have no chance to be freed if we don't take care of them later. This Patch is actually to revert commit 1cd4d5c4326a ("sctp: remove the unused sctp_datamsg_free()"), commit 69b5777f2e57 ("sctp: hold the chunks only after the chunk is enqueued in outq") and commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg"), to use sctp_datamsg_free to free the chunks of current msg. Fixes: 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/chunk.c | 13 +++++++++++++ net/sctp/outqueue.c | 1 - net/sctp/socket.c | 8 ++++++-- 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ce93c4b10d26..f61fb7c87e53 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -537,6 +537,7 @@ struct sctp_datamsg { struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); +void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a55e54738b81..af9cc8055465 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + + /* This doesn't have to be a _safe vairant because + * sctp_chunk_free() only drops the refs. + */ + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_chunk_free(chunk); + + sctp_datamsg_put(msg); +} + /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index da2418b64c86..6c109b0f8495 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -304,7 +304,6 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); - sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); if (chunk->asoc->prsctp_enable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fc417a8b476..6cdc61c21438 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); - sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? */ - if (err) + if (err) { + sctp_datamsg_free(datamsg); goto out_free; + } pr_debug("%s: we sent primitively\n", __func__); + sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { -- cgit v1.2.3 From 83dbc3d4a38411ef38f680d7045c8478cc9c5a56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:22 +0800 Subject: sctp: make sctp_outq_flush/tail/uncork return void sctp_outq_flush return value is meaningless now, this patch is to make sctp_outq_flush return void, as well as sctp_outq_fail and sctp_outq_uncork. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/outqueue.c | 19 +++++++------------ net/sctp/sm_sideeffect.c | 9 ++++----- 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f61fb7c87e53..8693dc452a7f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1077,7 +1077,7 @@ struct sctp_outq { void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); +void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); @@ -1085,7 +1085,7 @@ void sctp_outq_restart(struct sctp_outq *); void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); -int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); +void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); /* Uncork and flush an outqueue. */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 052a4796a457..8c3f446d965c 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) +void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -318,9 +317,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) } if (!q->cork) - error = sctp_outq_flush(q, 0, gfp); - - return error; + sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list @@ -748,12 +745,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) +void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0, gfp); + sctp_outq_flush(q, 0, gfp); } @@ -766,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) * locking concerns must be made. Today we use the sock lock to protect * this function. */ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -891,7 +888,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) error = sctp_packet_transmit(&singleton, gfp); if (error < 0) { asoc->base.sk->sk_err = -error; - return 0; + return; } break; @@ -1175,8 +1172,6 @@ sctp_flush_out: /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } - - return 0; } /* Update unack_data based on the incoming SACK chunk */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index cf6e4f0de729..c345bf153bed 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1421,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, - gfp); + sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: @@ -1676,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1733,9 +1732,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; -- cgit v1.2.3 From 2c9d85d4d82d9e0a62aad08bf50650804e68ed30 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:36 +0200 Subject: netdevice: Add offload statistics ndo Add a new ndo to return statistics for offloaded operation. Since there can be many different offloaded operation with many stats types, the ndo gets an attribute id by which it knows which stats are wanted. The ndo also gets a void pointer to be cast according to the attribute id. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2095b6ab3661..a10d8d18ce19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -924,6 +924,14 @@ struct netdev_xdp { * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * + * bool (*ndo_has_offload_stats)(int attr_id) + * Return true if this device supports offload stats of this attr_id. + * + * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, + * void *attr_data) + * Get statistics for offload operations by attr_id. Write it into the + * attr_data pointer. + * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. @@ -1155,6 +1163,10 @@ struct net_device_ops { struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); + bool (*ndo_has_offload_stats)(int attr_id); + int (*ndo_get_offload_stats)(int attr_id, + const struct net_device *dev, + void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, -- cgit v1.2.3 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 9 ++++ net/core/rtnetlink.c | 111 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9bf3aecfe05b..2351776a724f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, __IFLA_STATS_MAX, }; @@ -845,6 +846,14 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + /* XDP section */ enum { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 937e459bdaa9..0dbae4244a89 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } -- cgit v1.2.3 From d409b84768037ad03d1d73538d99fb902adf7365 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:08 -0700 Subject: ipv6: Export p6_route_input_lookup symbol Make ip6_route_input_lookup available outside of ipv6 the module similar to ip_route_input_noref in the IPv4 world. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/route.c | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d97305d0e71f..e0cd318d5103 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) } void ip6_route_input(struct sk_buff *skb); +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..4dab585f7642 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void ip6_route_input(struct sk_buff *skb) { -- cgit v1.2.3 From e8bffe0cf964f0330595bb376b74921cccdaac88 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:13 -0700 Subject: net: Add _nf_(un)register_hooks symbols Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow caller to hold RTNL mutex. Signed-off-by: Mahesh Bandewar CC: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 ++ net/netfilter/core.c | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..e82b76781bf6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..2c5327e43a88 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(®->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +208,34 @@ rollback: break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(®->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +259,26 @@ err: } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, struct nf_hook_state *state, -- cgit v1.2.3 From 4fbae7d83c98c30efcf0a2a2ac55fbb75ef5a1a5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:19 -0700 Subject: ipvlan: Introduce l3s mode In a typical IPvlan L3 setup where master is in default-ns and each slave is into different (slave) ns. In this setup egress packet processing for traffic originating from slave-ns will hit all NF_HOOKs in slave-ns as well as default-ns. However same is not true for ingress processing. All these NF_HOOKs are hit only in the slave-ns skipping them in the default-ns. IPvlan in L3 mode is restrictive and if admins want to deploy iptables rules in default-ns, this asymmetric data path makes it impossible to do so. This patch makes use of the l3_rcv() (added as part of l3mdev enhancements) to perform input route lookup on RX packets without changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN to change the skb->dev just before handing over skb to L4. Signed-off-by: Mahesh Bandewar CC: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ipvlan.txt | 7 ++- drivers/net/Kconfig | 1 + drivers/net/ipvlan/ipvlan.h | 6 +++ drivers/net/ipvlan/ipvlan_core.c | 94 +++++++++++++++++++++++++++++++++++++ drivers/net/ipvlan/ipvlan_main.c | 87 +++++++++++++++++++++++++++++++--- include/uapi/linux/if_link.h | 1 + 6 files changed, 188 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 14422f8fcdc4..24196cef7c91 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module There are no module parameters for this driver and it can be configured using IProute2/ip utility. - ip link add link type ipvlan mode { l2 | L3 } + ip link add link type ipvlan mode { l2 | l3 | l3s } e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 @@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be used before packets are queued on the outbound device. In this mode the slaves will not receive nor can send multicast / broadcast traffic. +4.3 L3S mode: + This is very similar to the L3 mode except that iptables (conn-tracking) +works in this mode and hence it is L3-symmetric (L3s). This will have slightly less +performance but that shouldn't matter since you are choosing this mode over plain-L3 +mode to make conn-tracking work. 5. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0c5415b05ea9..8768a625350d 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config IPVLAN tristate "IP-VLAN support" depends on INET depends on IPV6 + depends on NET_L3_MASTER_DEV ---help--- This allows one to create virtual devices off of a main interface and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 695a5dc9ace3..7e0732f5ea07 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" @@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto); +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); #endif /* __IPVLAN_H */ diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index b5f9511d819e..b4e990743e1d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: + case IPVLAN_MODE_L3S: return ipvlan_xmit_mode_l3(skb, dev); } @@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); + case IPVLAN_MODE_L3S: + return RX_HANDLER_PASS; } /* Should not reach here */ @@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; } + +static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipvl_addr *addr = NULL; + struct ipvl_port *port; + void *lyr3h; + int addr_type; + + if (!dev || !netif_is_ipvlan_port(dev)) + goto out; + + port = ipvlan_port_get_rcu(dev); + if (!port || port->mode != IPVLAN_MODE_L3S) + goto out; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); +out: + return addr; +} + +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto) +{ + struct ipvl_addr *addr; + struct net_device *sdev; + + addr = ipvlan_skb_to_addr(skb, dev); + if (!addr) + goto out; + + sdev = addr->master->dev; + switch (proto) { + case AF_INET: + { + int err; + struct iphdr *ip4h = ip_hdr(skb); + + err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, + ip4h->tos, sdev); + if (unlikely(err)) + goto out; + break; + } + case AF_INET6: + { + struct dst_entry *dst; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct flowi6 fl6 = { + .flowi6_iif = sdev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + skb_dst_drop(skb); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + skb_dst_set(skb, dst); + break; + } + default: + break; + } + +out: + return skb; +} + +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct ipvl_addr *addr; + unsigned int len; + + addr = ipvlan_skb_to_addr(skb, skb->dev); + if (!addr) + goto out; + + skb->dev = addr->master->dev; + len = skb->len + ETH_HLEN; + ipvlan_count_rx(addr->master, len, true, false); +out: + return NF_ACCEPT; +} diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 18b4e8c7f68a..f442eb366863 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -9,24 +9,87 @@ #include "ipvlan.h" +static u32 ipvl_nf_hook_refcnt = 0; + +static struct nf_hook_ops ipvl_nfops[] __read_mostly = { + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, +}; + +static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = { + .l3mdev_l3_rcv = ipvlan_l3_rcv, +}; + static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) { ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; } -static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) +static int ipvlan_register_nf_hook(void) +{ + int err = 0; + + if (!ipvl_nf_hook_refcnt) { + err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); + if (!err) + ipvl_nf_hook_refcnt = 1; + } else { + ipvl_nf_hook_refcnt++; + } + + return err; +} + +static void ipvlan_unregister_nf_hook(void) +{ + WARN_ON(!ipvl_nf_hook_refcnt); + + ipvl_nf_hook_refcnt--; + if (!ipvl_nf_hook_refcnt) + _nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); +} + +static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) { struct ipvl_dev *ipvlan; + struct net_device *mdev = port->dev; + int err = 0; + ASSERT_RTNL(); if (port->mode != nval) { + if (nval == IPVLAN_MODE_L3S) { + /* New mode is L3S */ + err = ipvlan_register_nf_hook(); + if (!err) { + mdev->l3mdev_ops = &ipvl_l3mdev_ops; + mdev->priv_flags |= IFF_L3MDEV_MASTER; + } else + return err; + } else if (port->mode == IPVLAN_MODE_L3S) { + /* Old mode was L3S */ + mdev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + mdev->l3mdev_ops = NULL; + } list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - if (nval == IPVLAN_MODE_L3) + if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) ipvlan->dev->flags |= IFF_NOARP; else ipvlan->dev->flags &= ~IFF_NOARP; } port->mode = nval; } + return err; } static int ipvlan_port_create(struct net_device *dev) @@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev) struct ipvl_port *port = ipvlan_port_get_rtnl(dev); dev->priv_flags &= ~IFF_IPVLAN_MASTER; + if (port->mode == IPVLAN_MODE_L3S) { + dev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + dev->l3mdev_ops = NULL; + } netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); __skb_queue_purge(&port->backlog); @@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev) struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_addr *addr; - if (ipvlan->port->mode == IPVLAN_MODE_L3) + if (ipvlan->port->mode == IPVLAN_MODE_L3 || + ipvlan->port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; @@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev, { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + int err = 0; if (data && data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); - ipvlan_set_port_mode(port, nmode); + err = ipvlan_set_port_mode(port, nmode); } - return 0; + return err; } static size_t ipvlan_nl_getsize(const struct net_device *dev) @@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, unregister_netdevice(dev); return err; } + err = ipvlan_set_port_mode(port, mode); + if (err) { + unregister_netdevice(dev); + return err; + } list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); - ipvlan_set_port_mode(port, mode); - netif_stacked_transfer_operstate(phy_dev, dev); return 0; } diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2351776a724f..7ec9e99d5491 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -464,6 +464,7 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, IPVLAN_MODE_MAX }; -- cgit v1.2.3 From ec323368793b8570c02e723127611a8d906a9b3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:32 +0200 Subject: sched: remove qdisc arg from __qdisc_dequeue_head Moves qdisc stat accouting to qdisc_dequeue_head. The only direct caller of the __qdisc_dequeue_head version open-codes this now. This allows us to later use __qdisc_dequeue_head as a replacement of __skb_dequeue() (which operates on sk_buff_head list). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 15 ++++++++------- net/sched/sch_generic.c | 7 ++++++- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52a2015667b4..0741ed41575b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -614,11 +614,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, - struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) { struct sk_buff *skb = __skb_dequeue(list); + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) +{ + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); @@ -627,11 +633,6 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, return skb; } -static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) -{ - return __qdisc_dequeue_head(sch, &sch->q); -} - /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5e63bf638350..73877d9c2bcb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -506,7 +506,12 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(band >= 0)) { struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + struct sk_buff *skb = __qdisc_dequeue_head(list); + + if (likely(skb != NULL)) { + qdisc_qstats_backlog_dec(qdisc, skb); + qdisc_bstats_update(qdisc, skb); + } qdisc->q.qlen--; if (skb_queue_empty(list)) -- cgit v1.2.3 From 48da34b7a74201f15315cb1fc40bb9a7bd2b4940 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:34 +0200 Subject: sched: add and use qdisc_skb_head helpers This change replaces sk_buff_head struct in Qdiscs with new qdisc_skb_head. Its similar to the skb_buff_head api, but does not use skb->prev pointers. Qdiscs will commonly enqueue at the tail of a list and dequeue at head. While skb_buff_head works fine for this, enqueue/dequeue needs to also adjust the prev pointer of next element. The ->prev pointer is not required for qdiscs so we can just leave it undefined and avoid one cacheline write access for en/dequeue. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 63 ++++++++++++++++++++++++++++++++++++++--------- net/sched/sch_generic.c | 21 ++++++++-------- net/sched/sch_htb.c | 24 +++++++++++++++--- net/sched/sch_netem.c | 14 +++++++++-- 4 files changed, 94 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0741ed41575b..e6aa0a249672 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,14 @@ struct qdisc_size_table { u16 data[]; }; +/* similar to sk_buff_head, but skb->prev pointer is undefined. */ +struct qdisc_skb_head { + struct sk_buff *head; + struct sk_buff *tail; + __u32 qlen; + spinlock_t lock; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -76,7 +84,7 @@ struct Qdisc { * For performance sake on SMP, we put highly modified fields at the end */ struct sk_buff *gso_skb ____cacheline_aligned_in_smp; - struct sk_buff_head q; + struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; struct gnet_stats_queue qstats; @@ -600,10 +608,27 @@ static inline void qdisc_qstats_overlimit(struct Qdisc *sch) sch->qstats.overlimits++; } +static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh) +{ + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; +} + static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff_head *list) + struct qdisc_skb_head *qh) { - __skb_queue_tail(list, skb); + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; @@ -614,9 +639,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = qh->head; + + if (likely(skb != NULL)) { + qh->head = skb->next; + qh->qlen--; + if (qh->head == NULL) + qh->tail = NULL; + skb->next = NULL; + } return skb; } @@ -643,10 +676,10 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, - struct sk_buff_head *list, + struct qdisc_skb_head *qh, struct sk_buff **to_free) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); @@ -667,7 +700,9 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch, static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { - return skb_peek(&sch->q); + const struct qdisc_skb_head *qh = &sch->q; + + return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ @@ -702,15 +737,19 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } -static inline void __qdisc_reset_queue(struct sk_buff_head *list) +static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ - if (!skb_queue_empty(list)) { - rtnl_kfree_skbs(list->next, list->prev); - __skb_queue_head_init(list); + ASSERT_RTNL(); + if (qh->qlen) { + rtnl_kfree_skbs(qh->head, qh->tail); + + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 73877d9c2bcb..6cfb6e9038c2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { */ struct pfifo_fast_priv { u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct qdisc_skb_head q[PFIFO_FAST_BANDS]; }; /* @@ -477,7 +477,7 @@ struct pfifo_fast_priv { */ static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, +static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, int band) { return priv->q + band; @@ -489,7 +489,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *list = band2list(priv, band); priv->bitmap |= (1 << band); qdisc->q.qlen++; @@ -505,8 +505,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(list); + struct qdisc_skb_head *qh = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(qdisc, skb); @@ -514,7 +514,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } qdisc->q.qlen--; - if (skb_queue_empty(list)) + if (qh->qlen == 0) priv->bitmap &= ~(1 << band); return skb; @@ -529,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *qh = band2list(priv, band); - return skb_peek(list); + return qh->head; } return NULL; @@ -569,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + qdisc_skb_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -617,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - skb_queue_head_init(&sch->q); + qdisc_skb_head_init(&sch->q); + spin_lock_init(&sch->q.lock); spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 53dbfa187870..c798d0de8a9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -162,7 +162,7 @@ struct htb_sched { struct work_struct work; /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; + struct qdisc_skb_head direct_queue; long direct_pkts; struct qdisc_watchdog watchdog; @@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } +static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, + struct qdisc_skb_head *qh) +{ + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; +} + static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); + htb_enqueue_tail(skb, sch, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) unsigned long start_at; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ - skb = __skb_dequeue(&q->direct_queue); + skb = __qdisc_dequeue_head(&q->direct_queue); if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); @@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - __skb_queue_head_init(&q->direct_queue); + qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0a964b35f8c7..9f7b380cf0a3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } +static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) +{ + skb->next = qh->head; + + if (!qh->head) + qh->tail = skb; + qh->head = skb; + qh->qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -523,7 +533,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *last; if (sch->q.qlen) - last = skb_peek_tail(&sch->q); + last = sch->q.tail; else last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { @@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&sch->q, skb); + netem_enqueue_skb_head(&sch->q, skb); sch->qstats.requeues++; } -- cgit v1.2.3 From 53f863a66904542b03204f2b115d050b04c11ba5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 21 Jul 2016 14:12:40 +0200 Subject: Bluetooth: Put led_trigger field behind CONFIG_BT_LEDS The led_trigger field in hci_dev should be conditional based on if CONFIG_BT_LEDS is set or not. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ee7fc47680a1..b8d43bd9c71b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -399,7 +399,9 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; +#if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; +#endif int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); -- cgit v1.2.3 From 65010e68efbeda4275845240869138c0c4587422 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 12 Aug 2016 17:01:27 -0700 Subject: Bluetooth: Add HCI device identifier for Qualcomm SMD This patch assigns the next free HCI device identifier to Bluetooth devices based on the Qualcomm Shared Memory channels. Signed-off-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 003b25283407..0aac123b5eee 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -63,6 +63,7 @@ #define HCI_SDIO 6 #define HCI_SPI 7 #define HCI_I2C 8 +#define HCI_SMD 9 /* HCI controller types */ #define HCI_PRIMARY 0x00 -- cgit v1.2.3 From 1aabbbcefe8e62fbffaaa01ca8bdd4cd6ed1625b Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 29 Jul 2016 13:28:25 +0200 Subject: Bluetooth: add printf format attribute to hci_set_[fh]w_info() Commit 5177a83827cd ("Bluetooth: Add debugfs fields for hardware and firmware info") introduced hci_set_hw_info() and hci_set_fw_info(). These functions use kvasprintf_const() but are not marked with a __printf attribute. Adding such an attribute helps detecting issues related to printf-formatting at build time. Signed-off-by: Nicolas Iooss Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b8d43bd9c71b..cc349f633570 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1028,8 +1028,8 @@ int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); -void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); -void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); +__printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); +__printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); -- cgit v1.2.3 From 70ecce91e3a2d7e332fe56fd065c67d404b8fccf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:38 +0200 Subject: Bluetooth: Store control socket cookie and comm information To further allow unique identification and tracking of control socket, store cookie and comm information when binding the socket. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_sock.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bfd1590821d6..69b5174168b7 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -371,6 +371,7 @@ void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); int hci_sock_test_flag(struct sock *sk, int nr); unsigned short hci_sock_get_channel(struct sock *sk); +u32 hci_sock_get_cookie(struct sock *sk); int hci_sock_init(void); void hci_sock_cleanup(void); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 99dd1503ef56..4dce6dfdb0f2 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,8 @@ static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); +static DEFINE_IDA(sock_cookie_ida); + static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ @@ -52,6 +55,8 @@ struct hci_pinfo { __u32 cmsg_mask; unsigned short channel; unsigned long flags; + __u32 cookie; + char comm[TASK_COMM_LEN]; }; void hci_sock_set_flag(struct sock *sk, int nr) @@ -74,6 +79,11 @@ unsigned short hci_sock_get_channel(struct sock *sk) return hci_pi(sk)->channel; } +u32 hci_sock_get_cookie(struct sock *sk) +{ + return hci_pi(sk)->cookie; +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -585,6 +595,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + int id; BT_DBG("sock %p sk %p", sock, sk); @@ -593,8 +604,17 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; - if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); + break; + case HCI_CHANNEL_CONTROL: + id = hci_pi(sk)->cookie; + + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + break; + } bt_sock_unlink(&hci_sk_list, sk); @@ -957,6 +977,15 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + int id; + + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 03c979c4717c7fa0c058fafe76ac4d6acdd1fb0d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:39 +0200 Subject: Bluetooth: Introduce helper to pack mgmt version information The mgmt version information will be also needed for the control changell tracing feature. This provides a helper to pack them. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc349f633570..9f181b583b96 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1451,6 +1451,7 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ +void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7639290b6de3..9071886df194 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -278,6 +278,14 @@ static u8 le_addr_type(u8 mgmt_addr_type) return ADDR_LE_DEV_RANDOM; } +void mgmt_fill_version_info(void *ver) +{ + struct mgmt_rp_read_version *rp = ver; + + rp->version = MGMT_VERSION; + rp->revision = cpu_to_le16(MGMT_REVISION); +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -285,8 +293,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); - rp.version = MGMT_VERSION; - rp.revision = cpu_to_le16(MGMT_REVISION); + mgmt_fill_version_info(&rp); return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); -- cgit v1.2.3 From 249fa1699f8642c73eb43e61b321969f0549ab2c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:40 +0200 Subject: Bluetooth: Add support for sending MGMT open and close to monitor This sends new notifications to the monitor support whenever a management channel has been opened or closed. This allows tracing of control channels really easily. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 587d0131b349..9640790cbbcc 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -45,6 +45,8 @@ struct hci_mon_hdr { #define HCI_MON_VENDOR_DIAG 11 #define HCI_MON_SYSTEM_NOTE 12 #define HCI_MON_USER_LOGGING 13 +#define HCI_MON_CTRL_OPEN 14 +#define HCI_MON_CTRL_CLOSE 15 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4dce6dfdb0f2..2d8725006838 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -394,6 +394,59 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + u16 format = 0x0002; + u8 ver[3]; + u32 flags; + + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + if (!skb) + return NULL; + + mgmt_fill_version_info(ver); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(format, skb_put(skb, 2)); + memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver)); + put_unaligned_le32(flags, skb_put(skb, 4)); + *skb_put(skb, 1) = TASK_COMM_LEN; + memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(4, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -468,6 +521,29 @@ static void send_monitor_replay(struct sock *sk) read_unlock(&hci_dev_list_lock); } +static void send_monitor_control_replay(struct sock *mon_sk) +{ + struct sock *sk; + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + skb = create_monitor_ctrl_open(sk); + if (!skb) + continue; + + if (sock_queue_rcv_skb(mon_sk, skb)) + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { @@ -595,6 +671,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + struct sk_buff *skb; int id; BT_DBG("sock %p sk %p", sock, sk); @@ -611,6 +688,14 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_CONTROL: id = hci_pi(sk)->cookie; + /* Send event to monitor */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_pi(sk)->cookie = 0xffffffff; ida_simple_remove(&sock_cookie_ida, id); break; @@ -931,6 +1016,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Bluetooth subsystem version %s", BT_SUBSYS_VERSION); send_monitor_replay(sk); + send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; @@ -977,6 +1063,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; int id; id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); @@ -986,6 +1073,14 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 38ceaa00d02dceb22c6bdd5268f5a44d5c00e123 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:41 +0200 Subject: Bluetooth: Add support for sending MGMT commands and events to monitor This adds support for tracing all management commands and events via the monitor interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 3 ++ include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 94 ++++++++++++++++++++++++++++++++++++++++ net/bluetooth/mgmt_util.c | 66 ++++++++++++++++++++++++++-- 4 files changed, 162 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9f181b583b96..a48f71d73dc8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1406,6 +1406,9 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb); void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk); void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb); +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk); void hci_sock_dev_event(struct hci_dev *hdev, int event); diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 9640790cbbcc..240786b04a46 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -47,6 +47,8 @@ struct hci_mon_hdr { #define HCI_MON_USER_LOGGING 13 #define HCI_MON_CTRL_OPEN 14 #define HCI_MON_CTRL_CLOSE 15 +#define HCI_MON_CTRL_COMMAND 16 +#define HCI_MON_CTRL_EVENT 17 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 2d8725006838..576ea48631b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -315,6 +315,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk) +{ + struct sock *sk; + __le16 index; + + if (hdev) + index = cpu_to_le16(hdev->id); + else + index = cpu_to_le16(MGMT_INDEX_NONE); + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) + continue; + + /* Skip the original socket */ + if (sk == skip_sk) + continue; + + skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); + if (!skb) + continue; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(event, skb_put(skb, 2)); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + skb->tstamp = tstamp; + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; @@ -447,6 +501,33 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return skb; } +static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, + u16 opcode, u16 len, + const void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -1257,6 +1338,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, goto done; } + if (chan->channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; + + /* Send event to monitor */ + skb = create_monitor_ctrl_command(sk, index, opcode, len, + buf + sizeof(*hdr)); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 8c30c7eb8bef..c933bd08c1fe 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -21,12 +21,41 @@ SOFTWARE IS DISCLAIMED. */ +#include + #include #include +#include #include #include "mgmt_util.h" +static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, + u16 opcode, u16 len, void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { @@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, __net_timestamp(skb); hci_send_to_channel(channel, skb, flag, skip_sk); - kfree_skb(skb); + if (channel == HCI_CHANNEL_CONTROL) + hci_send_monitor_ctrl_event(hdev, event, data, data_len, + skb_get_ktime(skb), flag, skip_sk); + + kfree_skb(skb); return 0; } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; @@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) ev->status = status; ev->opcode = cpu_to_le16(cmd); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_STATUS, sizeof(*ev), ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; @@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, if (rp) memcpy(ev->data, rp, rp_len); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_COMPLETE, + sizeof(*ev) + rp_len, ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } -- cgit v1.2.3 From 5504c3a31061704512707bb23bd7835e8a5281e4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:46 +0200 Subject: Bluetooth: Use individual flags for certain management events Instead of hiding everything behind a general managment events flag, introduce indivdual flags that allow fine control over which events are send to a given management channel. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 5 ++++- net/bluetooth/hci_sock.c | 5 ++++- net/bluetooth/mgmt.c | 32 +++++++++++++------------------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0aac123b5eee..ddb9accac3a5 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,7 +208,10 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, - HCI_MGMT_GENERIC_EVENTS, + HCI_MGMT_OPTION_EVENTS, + HCI_MGMT_SETTING_EVENTS, + HCI_MGMT_DEV_CLASS_EVENTS, + HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, }; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 576ea48631b9..d37c2243157b 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1164,7 +1164,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); - hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9af5f7c2ea2..469f5cc3109b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -256,13 +256,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, flag, skip_sk); } -static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, - u16 len, struct sock *skip_sk) -{ - return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, - HCI_MGMT_GENERIC_EVENTS, skip_sk); -} - static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, struct sock *skip_sk) { @@ -579,8 +572,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) @@ -1007,8 +1000,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, - sizeof(ev), skip); + return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -3000,8 +2993,8 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto failed; - err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, - data, len, sk); + err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, + len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); goto failed; } @@ -6502,8 +6495,9 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), + HCI_MGMT_DEV_CLASS_EVENTS, NULL); new_settings(hdev, match.sk); @@ -7100,8 +7094,8 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - dev_class, 3, NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, + 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); if (match.sk) sock_put(match.sk); @@ -7130,8 +7124,8 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) return; } - mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); + mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 9e8305b39bfa23a83b932007654097f4676c2ba2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:35 +0200 Subject: Bluetooth: Use numbers for subsystem version string Instead of keeping a version string around, use version and revision numbers and then stringify them for use as module parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 3 ++- net/bluetooth/af_bluetooth.c | 10 +++++++--- net/bluetooth/hci_sock.c | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69b5174168b7..d705bcf40710 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,7 +29,8 @@ #include #include -#define BT_SUBSYS_VERSION "2.21" +#define BT_SUBSYS_VERSION 2 +#define BT_SUBSYS_REVISION 21 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1d96ff3a8d87..1aff2da9bc74 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -713,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = { struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); +#define VERSION __stringify(BT_SUBSYS_VERSION) "." \ + __stringify(BT_SUBSYS_REVISION) + static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", BT_SUBSYS_VERSION); + BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) @@ -797,7 +801,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann "); -MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); -MODULE_VERSION(BT_SUBSYS_VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 804208d48368..a4227c777d16 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1117,8 +1117,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); - send_monitor_note(sk, "Bluetooth subsystem version %s", - BT_SUBSYS_VERSION); + send_monitor_note(sk, "Bluetooth subsystem version %u.%u", + BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); -- cgit v1.2.3 From 321c6feed2519a2691f65e41c4d62332d6ee3d52 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 16:46:23 +0200 Subject: Bluetooth: Add framework for Extended Controller Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command is used to retrieve the current state and basic information of a controller. It is typically used right after getting the response to the Read Controller Index List command or an Index Added event (or its extended counterparts). When any of the values in the EIR_Data field changes, the event Extended Controller Information Changed will be used to inform clients about the updated information. Signed-off-by: Marcel Holtmann Signed-off-by: MichaÅ‚ Narajowski --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/mgmt.h | 18 +++++++++++++ net/bluetooth/mgmt.c | 62 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ddb9accac3a5..99aa5e5e3100 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,6 +208,7 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, + HCI_MGMT_EXT_INFO_EVENTS, HCI_MGMT_OPTION_EVENTS, HCI_MGMT_SETTING_EVENTS, HCI_MGMT_DEV_CLASS_EVENTS, diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7647964b1efa..611b243713ea 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -586,6 +586,18 @@ struct mgmt_rp_get_adv_size_info { #define MGMT_OP_START_LIMITED_DISCOVERY 0x0041 +#define MGMT_OP_READ_EXT_INFO 0x0042 +#define MGMT_READ_EXT_INFO_SIZE 0 +struct mgmt_rp_read_ext_info { + bdaddr_t bdaddr; + __u8 version; + __le16 manufacturer; + __le32 supported_settings; + __le32 current_settings; + __le16 eir_len; + __u8 eir[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -800,3 +812,9 @@ struct mgmt_ev_advertising_added { struct mgmt_ev_advertising_removed { __u8 instance; } __packed; + +#define MGMT_EV_EXT_INFO_CHANGED 0x0025 +struct mgmt_ev_ext_info_changed { + __le16 eir_len; + __u8 eir[0]; +} __packed; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 47efdb4a669a..69001f415efa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -104,6 +104,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADVERTISING, MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_events[] = { @@ -141,6 +142,7 @@ static const u16 mgmt_events[] = { MGMT_EV_LOCAL_OOB_DATA_UPDATED, MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -149,6 +151,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_UNCONF_INDEX_LIST, MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -162,6 +165,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_NEW_CONFIG_OPTIONS, MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -862,6 +866,52 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_ext_info rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + bacpy(&rp.bdaddr, &hdev->bdaddr); + + rp.version = hdev->hci_ver; + rp.manufacturer = cpu_to_le16(hdev->manufacturer); + + rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + + rp.eir_len = cpu_to_le16(0); + + hci_dev_unlock(hdev); + + /* If this command is called at least once, then the events + * for class of device and local name changes are disabled + * and only the new extended controller information event + * is used. + */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, + sizeof(rp)); +} + +static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_ext_info_changed ev; + + ev.eir_len = cpu_to_le16(0); + + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, + sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); +} + static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); @@ -2995,6 +3045,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); + ext_info_changed(hdev, sk); goto failed; } @@ -6356,6 +6407,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, + { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -6494,10 +6547,12 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); - if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, zero_cod, sizeof(zero_cod), HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } new_settings(hdev, match.sk); @@ -7093,9 +7148,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); - if (!status) + if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } if (match.sk) sock_put(match.sk); @@ -7126,6 +7183,7 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); + ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 4037a7747d7b5a3e5bb4d10fb9ea6e2fd8a23c3b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 8 Sep 2016 18:07:07 +0200 Subject: Bluetooth: Increase the subsystem minor version number While the subsystem version information are purely informational, increase the minor number due to the addition of user channel and management control monitoring suppport. It is helpful for debugging purposes to see the version numbers change. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index d705bcf40710..0a1e21d7bce1 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -30,7 +30,7 @@ #include #define BT_SUBSYS_VERSION 2 -#define BT_SUBSYS_REVISION 21 +#define BT_SUBSYS_REVISION 22 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 -- cgit v1.2.3 From c4960ecf2b09210930964ef2c05ce2590802ccf4 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Sun, 18 Sep 2016 12:50:03 +0200 Subject: Bluetooth: Add support for appearance in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables prepending appearance value to scan response data. It also adds support for setting appearance value through mgmt command. If currently advertised instance has apperance flag set it is expired immediately. Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 6 ++++++ net/bluetooth/hci_request.c | 8 ++++++++ net/bluetooth/mgmt.c | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a48f71d73dc8..f00bf667ec33 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -211,6 +211,7 @@ struct hci_dev { __u8 dev_name[HCI_MAX_NAME_LENGTH]; __u8 short_name[HCI_MAX_SHORT_NAME_LENGTH]; __u8 eir[HCI_MAX_EIR_LENGTH]; + __u16 appearance; __u8 dev_class[3]; __u8 major_class; __u8 minor_class; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 611b243713ea..72a456bbbcd5 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -598,6 +598,12 @@ struct mgmt_rp_read_ext_info { __u8 eir[0]; } __packed; +#define MGMT_OP_SET_APPEARANCE 0x0043 +struct mgmt_cp_set_appearance { + __u16 appearance; +} __packed; +#define MGMT_SET_APPEARANCE_SIZE 2 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 0ce6cdd278b2..c8135680c43e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1015,6 +1015,14 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + ptr[0] = 3; + ptr[1] = EIR_APPEARANCE; + put_unaligned_le16(hdev->appearance, ptr + 2); + scan_rsp_len += 4; + ptr += 4; + } + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 89954bb19222..78d708851208 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -105,6 +105,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, + MGMT_OP_SET_APPEARANCE, }; static const u16 mgmt_events[] = { @@ -3143,6 +3144,34 @@ failed: return err; } +static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_appearance *cp = data; + u16 apperance; + int err; + + BT_DBG(""); + + apperance = le16_to_cpu(cp->appearance); + + hci_dev_lock(hdev); + + if (hdev->appearance != apperance) { + hdev->appearance = apperance; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + } + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, + 0); + + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5918,6 +5947,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) @@ -5999,6 +6029,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + max_len -= 4; } if (len > max_len) @@ -6335,6 +6368,9 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) + max_len -= 4; } return max_len; @@ -6470,6 +6506,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { set_appearance, MGMT_SET_APPEARANCE_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 6a5d58b67e205f2ffc62d0a9ee4ef7d237e9a7fb Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:42 -0400 Subject: net sched ife action: add 16 bit helpers encoder and checker for 16 bits metadata Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 2 ++ net/sched/act_ife.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 5164bd7a38fb..9fd2bea0a6e0 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -50,9 +50,11 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi); int ife_validate_meta_u32(void *val, int len); int ife_validate_meta_u16(void *val, int len); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi); void ife_release_meta_gen(struct tcf_meta_info *mi); int register_ife_op(struct tcf_meta_ops *mops); int unregister_ife_op(struct tcf_meta_ops *mops); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e87cd81315e1..ccf7b4b655fe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) } EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u16 edata = 0; + + if (mi->metaval) + edata = *(u16 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htons(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u16); + int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) @@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_check_meta_u32); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u16); + int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; -- cgit v1.2.3 From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:43 -0400 Subject: net sched ife action: Introduce skb tcindex metadata encap decap Sample use case of how this is encoded: user space via tuntap (or a connected VM/Machine/container) encodes the tcindex TLV. Sample use case of decoding: IFE action decodes it and the skb->tc_index is then used to classify. So something like this for encoded ICMP packets: .. first decode then reclassify... skb->tcindex will be set sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify ...next match the decode icmp packet... sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue ... last classify it using the tcindex classifier and do someaction.. sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \ handle 0x11 tcindex classid 1:1 \ action blah.. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_ife.h | 3 +- net/sched/Kconfig | 5 +++ net/sched/Makefile | 1 + net/sched/act_meta_skbtcindex.c | 79 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 net/sched/act_meta_skbtcindex.c (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 4ece02a77b9a..cd18360eca24 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -32,8 +32,9 @@ enum { #define IFE_META_HASHID 2 #define IFE_META_PRIO 3 #define IFE_META_QMAP 4 +#define IFE_META_TCINDEX 5 /*Can be overridden at runtime by module option*/ -#define __IFE_META_MAX 5 +#define __IFE_META_MAX 6 #define IFE_META_MAX (__IFE_META_MAX - 1) #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7795d5a3f79a..87956a768d1b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO depends on NET_ACT_IFE ---help--- +config NET_IFE_SKBTCINDEX + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 148ae0d5ac2c..4bdda3634e0b 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c new file mode 100644 index 000000000000..3b35774ce890 --- /dev/null +++ b/net/sched/act_meta_skbtcindex.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2016) + * +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifetc_index = skb->tc_index; + + return ife_encode_meta_u16(ifetc_index, skbdata, e); +} + +static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len) +{ + u16 ifetc_index = *(u16 *)data; + + skb->tc_index = ntohs(ifetc_index); + return 0; +} + +static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u16(skb->tc_index, e); +} + +static struct tcf_meta_ops ife_skbtcindex_ops = { + .metaid = IFE_META_TCINDEX, + .metatype = NLA_U16, + .name = "tc_index", + .synopsis = "skb tc_index 16 bit metadata", + .check_presence = skbtcindex_check, + .encode = skbtcindex_encode, + .decode = skbtcindex_decode, + .get = ife_get_meta_u16, + .alloc = ife_alloc_meta_u16, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u16, + .owner = THIS_MODULE, +}; + +static int __init ifetc_index_init_module(void) +{ + return register_ife_op(&ife_skbtcindex_ops); +} + +static void __exit ifetc_index_cleanup_module(void) +{ + unregister_ife_op(&ife_skbtcindex_ops); +} + +module_init(ifetc_index_init_module); +module_exit(ifetc_index_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2016)"); +MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); -- cgit v1.2.3 From ca26893f05e86497a86732768ec53cd38c0819ca Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2016 19:00:09 +0800 Subject: rhashtable: Add rhlist interface The insecure_elasticity setting is an ugly wart brought out by users who need to insert duplicate objects (that is, distinct objects with identical keys) into the same table. In fact, those users have a much bigger problem. Once those duplicate objects are inserted, they don't have an interface to find them (unless you count the walker interface which walks over the entire table). Some users have resorted to doing a manual walk over the hash table which is of course broken because they don't handle the potential existence of multiple hash tables. The result is that they will break sporadically when they encounter a hash table resize/rehash. This patch provides a way out for those users, at the expense of an extra pointer per object. Essentially each object is now a list of objects carrying the same key. The hash table will only see the lists so nothing changes as far as rhashtable is concerned. To use this new interface, you need to insert a struct rhlist_head into your objects instead of struct rhash_head. While the hash table is unchanged, for type-safety you'll need to use struct rhltable instead of struct rhashtable. All the existing interfaces have been duplicated for rhlist, including the hash table walker. One missing feature is nulls marking because AFAIK the only potential user of it does not need duplicate objects. Should anyone need this it shouldn't be too hard to add. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 491 ++++++++++++++++++++++++++++++++++----------- lib/rhashtable.c | 258 +++++++++++++++++++----- 2 files changed, 583 insertions(+), 166 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index fd82584acd48..5c132d3188be 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,7 +1,7 @@ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2015-2016 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * @@ -53,6 +53,11 @@ struct rhash_head { struct rhash_head __rcu *next; }; +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -137,6 +142,7 @@ struct rhashtable_params { * @key_len: Key length for hashfn * @elasticity: Maximum chain length before rehash * @p: Configuration parameters + * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list @@ -147,11 +153,20 @@ struct rhashtable { unsigned int key_len; unsigned int elasticity; struct rhashtable_params p; + bool rhlist; struct work_struct run_work; struct mutex mutex; spinlock_t lock; }; +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + /** * struct rhashtable_walker - Hash table walker * @list: List entry on list of walkers @@ -163,9 +178,10 @@ struct rhashtable_walker { }; /** - * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * struct rhashtable_iter - Hash table iterator * @ht: Table to iterate through * @p: Current pointer + * @list: Current hash list pointer * @walker: Associated rhashtable walker * @slot: Current slot * @skip: Number of entries to skip in slot @@ -173,6 +189,7 @@ struct rhashtable_walker { struct rhashtable_iter { struct rhashtable *ht; struct rhash_head *p; + struct rhlist_head *list; struct rhashtable_walker walker; unsigned int slot; unsigned int skip; @@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *old_tbl, - void **data); -int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); @@ -507,6 +522,31 @@ void rhashtable_destroy(struct rhashtable *ht); rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ tbl, hash, member) +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { @@ -516,18 +556,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } -/** - * rhashtable_lookup_fast - search hash table, inlined version - * @ht: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * Returns the first entry on which the compare function returned true. - */ -static inline void *rhashtable_lookup_fast( +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -539,8 +569,6 @@ static inline void *rhashtable_lookup_fast( struct rhash_head *he; unsigned int hash; - rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); @@ -549,8 +577,7 @@ restart: params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; - rcu_read_unlock(); - return rht_obj(ht, he); + return he; } /* Ensure we see any new tables. */ @@ -559,96 +586,165 @@ restart: tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; - rcu_read_unlock(); return NULL; } +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; - struct bucket_table *tbl, *new_tbl; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; struct rhash_head *head; spinlock_t *lock; - unsigned int elasticity; unsigned int hash; - void *data = NULL; - int err; + int elasticity; + void *data; -restart: rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); - /* All insertions must grab the oldest table containing - * the hashed bucket that is yet to be rehashed. - */ - for (;;) { - hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); - - if (tbl->rehash <= hash) - break; - + if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) { +slow_path: spin_unlock_bh(lock); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); } - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data); - if (!IS_ERR_OR_NULL(tbl)) - goto slow_path; + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; + + elasticity--; + if (!key || + (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + data = rht_obj(ht, head); - err = PTR_ERR(tbl); - if (err == -EEXIST) - err = 0; + if (!rhlist) + goto out; - goto out; - } - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - if (unlikely(rht_grow_above_100(ht, tbl))) { -slow_path: - spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht, tbl); - rcu_read_unlock(); - if (err) - return ERR_PTR(err); + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); - goto restart; + goto good; } - err = 0; - elasticity = ht->elasticity; - rht_for_each(head, tbl, hash) { - if (key && - unlikely(!(params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) { - data = rht_obj(ht, head); - goto out; - } - if (!--elasticity) - goto slow_path; - } + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out; + + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); @@ -656,11 +752,14 @@ slow_path: if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); +good: + data = NULL; + out: spin_unlock_bh(lock); rcu_read_unlock(); - return err ? ERR_PTR(err) : data; + return data; } /** @@ -685,13 +784,65 @@ static inline int rhashtable_insert_fast( { void *ret; - ret = __rhashtable_insert_fast(ht, NULL, obj, params); + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table @@ -722,7 +873,8 @@ static inline int rhashtable_lookup_insert_fast( BUG_ON(ht->p.obj_hashfn); - ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -759,7 +911,7 @@ static inline int rhashtable_lookup_insert_key( BUG_ON(!ht->p.obj_hashfn || !key); - ret = __rhashtable_insert_fast(ht, key, obj, params); + ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -783,13 +935,14 @@ static inline void *rhashtable_lookup_get_insert_key( { BUG_ON(!ht->p.obj_hashfn || !key); - return __rhashtable_insert_fast(ht, key, obj, params); + return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( +static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params) + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) { struct rhash_head __rcu **pprev; struct rhash_head *he; @@ -804,39 +957,66 @@ static inline int __rhashtable_remove_fast( pprev = &tbl->buckets[hash]; rht_for_each(he, tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); + if (he != obj) { + struct rhlist_head __rcu **lpprev; + pprev = &he->next; - continue; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + } while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; } - rcu_assign_pointer(*pprev, obj->next); - err = 0; + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + rcu_assign_pointer(*pprev, obj); break; } spin_unlock_bh(lock); + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); + err = 0; + } + return err; } -/** - * rhashtable_remove_fast - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * Returns zero on success, -ENOENT if the entry could not be found. - */ -static inline int rhashtable_remove_fast( +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; @@ -850,24 +1030,60 @@ static inline int rhashtable_remove_fast( * visible then that guarantees the entry to still be in * the old tbl if it exists. */ - while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) && + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; - if (err) - goto out; - - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - -out: rcu_read_unlock(); return err; } +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, @@ -958,4 +1174,51 @@ static inline int rhashtable_walk_init(struct rhashtable *ht, return 0; } +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit after this function returns. + */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + return rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. + */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + return rhltable_free_and_destroy(hlt, NULL, NULL); +} + #endif /* _LINUX_RHASHTABLE_H */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 06c28728bb53..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -378,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -439,57 +425,165 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl, - void **data) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); - - err = -EEXIST; - if (key) { - *data = rhashtable_lookup_fast(ht, key, ht->p); - if (*data) - goto exit; + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; } - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = 0; + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); @@ -593,11 +687,16 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -605,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -614,7 +725,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -802,6 +914,48 @@ int rhashtable_init(struct rhashtable *ht, } EXPORT_SYMBOL_GPL(rhashtable_init); +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy @@ -839,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +- include/linux/skbuff.h | 14 ++++- include/uapi/linux/bpf.h | 21 ++++++++ kernel/bpf/helpers.c | 3 ++ kernel/bpf/verifier.c | 54 ++++++++++++++----- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++------ 6 files changed, 196 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. @@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From a4f1f9ac8153e22869b6408832b5a9bb9c762bf6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:09 -0400 Subject: lib/win_minmax: windowed min or max estimator This commit introduces a generic library to estimate either the min or max value of a time-varying variable over a recent time window. This is code originally from Kathleen Nichols. The current form of the code is from Van Jacobson. A single struct minmax_sample will track the estimated windowed-max value of the series if you call minmax_running_max() or the estimated windowed-min value of the series if you call minmax_running_min(). Nearly equivalent code is already in place for minimum RTT estimation in the TCP stack. This commit extracts that code and generalizes it to handle both min and max. Moving the code here reduces the footprint and complexity of the TCP code base and makes the filter generally available for other parts of the codebase, including an upcoming TCP congestion control module. This library works well for time series where the measurements are smoothly increasing or decreasing. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/win_minmax.h | 37 +++++++++++++++++ lib/Makefile | 2 +- lib/win_minmax.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 include/linux/win_minmax.h create mode 100644 lib/win_minmax.c (limited to 'include') diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h new file mode 100644 index 000000000000..56569604278f --- /dev/null +++ b/include/linux/win_minmax.h @@ -0,0 +1,37 @@ +/** + * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. + * + */ +#ifndef MINMAX_H +#define MINMAX_H + +#include + +/* A single data point for our parameterized min-max tracker */ +struct minmax_sample { + u32 t; /* time measurement was taken */ + u32 v; /* value measured */ +}; + +/* State for the parameterized min-max tracker */ +struct minmax { + struct minmax_sample s[3]; +}; + +static inline u32 minmax_get(const struct minmax *m) +{ + return m->s[0].v; +} + +static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + m->s[2] = m->s[1] = m->s[0] = val; + return m->s[0].v; +} + +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas); +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 5dc77a8ec297..df747e5eeb7a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include +#include + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} -- cgit v1.2.3 From 6403389211e1f4d40ed963fe47a96fce1a3ba7a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:10 -0400 Subject: tcp: use windowed min filter library for TCP min_rtt estimation Refactor the TCP min_rtt code to reuse the new win_minmax library in lib/win_minmax.c to simplify the TCP code. This is a pure refactor: the functionality is exactly the same. We just moved the windowed min code to make TCP easier to read and maintain, and to allow other parts of the kernel to use the windowed min/max filter code. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++-- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 64 ++++-------------------------------------------- net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 10 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c723a465125d..6433cc8b4667 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -234,9 +235,7 @@ struct tcp_sock { u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ - struct rtt_meas { - u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ - } rtt_min[3]; + struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ diff --git a/include/net/tcp.h b/include/net/tcp.h index fdfbedd61c67..2f1648af4d12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -671,7 +671,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { - return tp->rtt_min[0].rtt; + return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dae800092e6..e79ed17ccfd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - tp->rtt_min[0].rtt = ~0U; + minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad3e7eeed94..6886f386464f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2879,67 +2879,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, *rexmit = REXMIT_LOST; } -/* Kathleen Nichols' algorithm for tracking the minimum value of - * a data stream over some fixed time interval. (E.g., the minimum - * RTT over the past five minutes.) It uses constant space and constant - * time per update yet almost always delivers the same minimum as an - * implementation that has to keep all the data in the window. - * - * The algorithm keeps track of the best, 2nd best & 3rd best min - * values, maintaining an invariant that the measurement time of the - * n'th best >= n-1'th best. It also makes sure that the three values - * are widely separated in the time window since that bounds the worse - * case error when that data is monotonically increasing over the window. - * - * Upon getting a new min, we can forget everything earlier because it - * has no value - the new min is <= everything else in the window by - * definition and it's the most recent. So we restart fresh on every new min - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd - * best. - */ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { - const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; - struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { - .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), - .ts = now, - }; - u32 elapsed; - - /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ - if (unlikely(rttm.rtt <= m[0].rtt)) - m[0] = m[1] = m[2] = rttm; - else if (rttm.rtt <= m[1].rtt) - m[1] = m[2] = rttm; - else if (rttm.rtt <= m[2].rtt) - m[2] = rttm; - - elapsed = now - m[0].ts; - if (unlikely(elapsed > wlen)) { - /* Passed entire window without a new min so make 2nd choice - * the new min & 3rd choice the new 2nd. So forth and so on. - */ - m[0] = m[1]; - m[1] = m[2]; - m[2] = rttm; - if (now - m[0].ts > wlen) { - m[0] = m[1]; - m[1] = rttm; - if (now - m[0].ts > wlen) - m[0] = rttm; - } - } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { - /* Passed a quarter of the window without a new min so - * take 2nd choice from the 2nd quarter of the window. - */ - m[2] = m[1] = rttm; - } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { - /* Passed half the window without a new min so take the 3rd - * choice from the last half of the window. - */ - m[2] = rttm; - } + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + + minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + rtt_us ? : jiffies_to_usecs(1)); } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f63c73dc0acb..568947110b60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - newtp->rtt_min[0].rtt = ~0U; + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; -- cgit v1.2.3 From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:11 -0400 Subject: net_sched: sch_fq: add low_rate_threshold parameter This commit adds to the fq module a low_rate_threshold parameter to insert a delay after all packets if the socket requests a pacing rate below the threshold. This helps achieve more precise control of the sending rate with low-rate paths, especially policers. The basic issue is that if a congestion control module detects a policer at a certain rate, it may want fq to be able to shape to that policed rate. That way the sender can avoid policer drops by having the packets arrive at the policer at or just under the policed rate. The default threshold of 550Kbps was chosen analytically so that for policers or links at 500Kbps or 512Kbps fq would very likely invoke this mechanism, even if the pacing rate was briefly slightly above the available bandwidth. This value was then empirically validated with two years of production testing on YouTube video servers. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 2382eed50278..f8e39dbaa781 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -792,6 +792,8 @@ enum { TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index dc52cc10d6ed..5dd929cc1423 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +483,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +494,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; -- cgit v1.2.3 From 0682e6902a52aca7caf6ad42551b16ea0f87bc31 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:13 -0400 Subject: tcp: count packets marked lost for a TCP connection Count the number of packets that a TCP connection marks lost. Congestion control modules can use this loss rate information for more intelligent decisions about how fast to send. Specifically, this is used in TCP BBR policer detection. BBR uses a high packet loss rate as one signal in its policer detection and policer bandwidth estimation algorithm. The BBR policer detection algorithm cannot simply track retransmits, because a retransmit can be (and often is) an indicator of packets lost long, long ago. This is particularly true in a long CA_Loss period that repairs the initial massive losses when a policer kicks in. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6433cc8b4667..38590fbc0ac5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -267,6 +267,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 lost; /* Total data packets lost incl. rexmits */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6886f386464f..9413288c2778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -899,12 +899,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } +/* Sum the number of packets on the wire we have marked as lost. + * There are two cases we care about here: + * a) Packet hasn't been marked lost (nor retransmitted), + * and this is the first loss. + * b) Packet has been marked both lost and retransmitted, + * and this means we think it was lost again. + */ +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (!(sacked & TCPCB_LOST) || + ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) + tp->lost += tcp_skb_pcount(skb); +} + static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); tp->lost_out += tcp_skb_pcount(skb); + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } } @@ -913,6 +930,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); + tcp_sum_lost(tp, skb); if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1890,6 +1908,7 @@ void tcp_enter_loss(struct sock *sk) struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ + bool mark_lost; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,8 +1942,12 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; + mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + is_reneg); + if (mark_lost) + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { + if (mark_lost) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); -- cgit v1.2.3 From b9f64820fb226a4e8ab10591f46cecd91ca56b30 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:14 -0400 Subject: tcp: track data delivery rate for a TCP connection This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series. Key state: tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field. tp->delivered_mstamp: the last time tp->delivered was updated. Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis: d1: the current tp->delivered after processing the ACK t1: the current time after processing the ACK d0: the prior tp->delivered when the acked skb was transmitted t0: the prior tp->delivered_mstamp when the acked skb was transmitted When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us. One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp. At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and and t1 are times at which packets were marked delivered. If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt) In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 + include/net/tcp.h | 35 +++++++++++- net/ipv4/Makefile | 2 +- net/ipv4/tcp_input.c | 46 +++++++++++----- net/ipv4/tcp_output.c | 4 ++ net/ipv4/tcp_rate.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/tcp_rate.c (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 38590fbc0ac5..c50e6aec005a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,8 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + struct skb_mstamp first_tx_mstamp; /* start of window send phase */ + struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f1648af4d12..b261c892605a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,14 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { - /* There is space for up to 20 bytes */ + /* There is space for up to 24 bytes */ __u32 in_flight;/* Bytes in flight when packet sent */ + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ + struct skb_mstamp first_tx_mstamp; + /* when we reached the "delivered" count */ + struct skb_mstamp delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -860,6 +866,26 @@ struct ack_sample { u32 in_flight; }; +/* A rate sample measures the number of (original/retransmitted) data + * packets delivered "delivered" over an interval of time "interval_us". + * The tcp_rate.c code fills in the rate sample, and congestion + * control modules that define a cong_control function to run at the end + * of ACK processing can optionally chose to consult this sample when + * setting cwnd and pacing rate. + * A sample is invalid if "delivered" or "interval_us" is negative. + */ +struct rate_sample { + struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + s32 delivered; /* number of packets delivered over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ + int losses; /* number of packets marked lost upon ACK */ + u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ + u32 prior_in_flight; /* in flight before this ACK */ + bool is_retrans; /* is sample from retransmission? */ +}; + struct tcp_congestion_ops { struct list_head list; u32 key; @@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_rate.c */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs); + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 24629b6f57cc..9cfff1a0bf71 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_recovery.o \ + tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9413288c2778..d9ed4bb96f74 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1112,6 +1112,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct rate_sample *rate; int flag; }; @@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) + TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack, tcp_skb_pcount(skb), &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); - if (found_dup_sack) + if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; + tp->delivered++; /* A spurious retransmission is delivered */ + } /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, + struct skb_mstamp *now) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt, now; + struct skb_mstamp first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; - fully_acked = false; } else { /* Speedup tcp_unlink_write_queue() and next loop */ @@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; + tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - skb_mstamp_get(&now); if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); + ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); } if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); + sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); } - + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us); @@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; + struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - u32 prior_delivered = tp->delivered; + u32 delivered = tp->delivered; + u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + struct skb_mstamp now; sack_state.first_sackt.v64 = 0; + sack_state.rate = &rs; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + skb_mstamp_get(&now); + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + &sack_state, &now); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + lost = tp->lost - lost; /* freshly marked lost */ + tcp_rate_gen(sk, delivered, lost, &now, &rs); + tcp_cong_control(sk, ack, delivered, flag); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8b45794eb6b2..e02c8ebf3ed4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; + tcp_rate_skb_sent(sk, skb); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); + /* Update delivered info for the new segment */ + TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; + /* If this packet has been sent out already, we must * adjust the various packet counters. */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c new file mode 100644 index 000000000000..1daed6af6e80 --- /dev/null +++ b/net/ipv4/tcp_rate.c @@ -0,0 +1,149 @@ +#include + +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + */ + + +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + tp->first_tx_mstamp = skb->skb_mstamp; + tp->delivered_mstamp = skb->skb_mstamp; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the highest prior_delivered count. + */ +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (!scb->tx.delivered_mstamp.v64) + return; + + if (!rs->prior_delivered || + after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = skb_mstamp_us_delta( + &skb->skb_mstamp, + &scb->tx.first_tx_mstamp); + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = skb->skb_mstamp; + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp.v64 = 0; +} + +/* Update the connection delivery information and generate a rate sample. */ +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = *now; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available. */ + if (!rs->prior_mstamp.v64) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + rs->interval_us = -1; + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + } +} -- cgit v1.2.3 From d7722e8570fc0f1e003cee7cf37694041828918b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:15 -0400 Subject: tcp: track application-limited rate samples This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application. Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it. This logic marks the flow app-limited on a write if *all* of the following are true: 1) There is less than 1 MSS of unsent data in the write queue available to transmit. 2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue). 3) The connection is not limited by cwnd. 4) There are no lost packets to retransmit. The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit. When we send every skb we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited. The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe. We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 8 ++++++++ net/ipv4/tcp_minisocks.c | 3 +++ net/ipv4/tcp_rate.c | 29 ++++++++++++++++++++++++++++- 5 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c50e6aec005a..fdcd00ffcb66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b261c892605a..a69ed7f0030c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -764,7 +764,9 @@ struct tcp_skb_cb { union { struct { /* There is space for up to 24 bytes */ - __u32 in_flight;/* Bytes in flight when packet sent */ + __u32 in_flight:30,/* Bytes in flight at transmit */ + is_app_limited:1, /* cwnd not fully used? */ + unused:1; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -883,6 +885,7 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ }; @@ -978,6 +981,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct skb_mstamp *now, struct rate_sample *rs); +void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de02fb4b1349..2250f891f931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ @@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; @@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 568947110b60..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1daed6af6e80..52ff84be59ab 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -26,9 +26,13 @@ * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. */ - /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, after(scb->tx.delivered, rs->prior_delivered)) { rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ @@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". @@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); } } + +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tp->snd_cwnd && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} -- cgit v1.2.3 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit export two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info). tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++++- include/uapi/linux/tcp.h | 3 +++ net/ipv4/tcp.c | 11 ++++++++++- net/ipv4/tcp_rate.c | 12 +++++++++++- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fdcd00ffcb66..a17ae7b85218 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -213,7 +213,8 @@ struct tcp_sock { u8 reord; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ - u8 unused; + u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + unused:7; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ @@ -271,6 +272,8 @@ struct tcp_sock { u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 482898fc433a..73ac0db487f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -167,6 +167,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + __u8 tcpi_delivery_rate_app_limited:1; __u32 tcpi_rto; __u32 tcpi_ato; @@ -211,6 +212,8 @@ struct tcp_info { __u32 tcpi_min_rtt; __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + __u64 tcpi_delivery_rate; }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2250f891f931..f253e5019d22 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2712,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp, intv; unsigned int start; int notsent_bytes; u64 rate64; @@ -2802,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; + rate = READ_ONCE(tp->rate_delivered); + intv = READ_ONCE(tp->rate_interval_us); + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + put_unaligned(rate64, &info->tcpi_delivery_rate); + } } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 52ff84be59ab..9be1581a5a08 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -149,12 +149,22 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * for connections suffer heavy or prolonged losses. */ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - rs->interval_us = -1; if (!rs->is_retrans) pr_debug("tcp rate: %ld %d %u %u %u\n", rs->interval_us, rs->delivered, inet_csk(sk)->icsk_ca_state, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; } } -- cgit v1.2.3 From ed6e7268b930e0a9a65d895d368eac79a438d992 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:17 -0400 Subject: tcp: allow congestion control module to request TSO skb segment count Add the tso_segs_goal() function in tcp_congestion_ops to allow the congestion control module to specify the number of segments that should be in a TSO skb sent by tcp_write_xmit() and tcp_xmit_retransmit_queue(). The congestion control module can either request a particular number of segments in TSO skb that we transmit, or return 0 if it doesn't care. This allows the upcoming BBR congestion control module to select small TSO skb sizes if the module detects that the bottleneck bandwidth is very low, or that the connection is policed to a low rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a69ed7f0030c..f8f581fd05f5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -913,6 +913,8 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* suggest number of segments for each skb to transmit (optional) */ + u32 (*tso_segs_goal)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e02c8ebf3ed4..01379567a732 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1566,6 +1566,17 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) return min_t(u32, segs, sk->sk_gso_max_segs); } +/* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. + */ +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + + return tso_segs ? : tcp_tso_autosize(sk, mss_now); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -2061,7 +2072,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } - max_segs = tcp_tso_autosize(sk, mss_now); + max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2778,7 +2789,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); + max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { __u8 sacked; int segs; -- cgit v1.2.3 From 1b3878ca1551f3baab2c408d1e703b5ef785a1b2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:18 -0400 Subject: tcp: export tcp_tso_autosize() and parameterize minimum number of TSO segments To allow congestion control modules to use the default TSO auto-sizing algorithm as one of the ingredients in their own decision about TSO sizing: 1) Export tcp_tso_autosize() so that CC modules can use it. 2) Change tcp_tso_autosize() to allow callers to specify a minimum number of segments per TSO skb, in case the congestion control module has a different notion of the best floor for TSO skbs for the connection right now. For very low-rate paths or policed connections it can be appropriate to use smaller TSO skbs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f8f581fd05f5..349204130d84 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); bool tcp_may_send_now(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01379567a732..0bf3d481fa85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1549,7 +1549,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1561,10 +1562,11 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) * This preserves ACK clocking and is consistent * with tcp_tso_should_defer() heuristic. */ - segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + segs = max_t(u32, bytes / mss_now, min_tso_segs); return min_t(u32, segs, sk->sk_gso_max_segs); } +EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1574,7 +1576,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; - return tso_segs ? : tcp_tso_autosize(sk, mss_now); + return tso_segs ? : + tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.3 From 77bfc174c38e558a3425d3b069aa2762b2fedfdd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:20 -0400 Subject: tcp: allow congestion control to expand send buffer differently Currently the TCP send buffer expands to twice cwnd, in order to allow limited transmits in the CA_Recovery state. This assumes that cwnd does not increase in the CA_Recovery. For some congestion control algorithms, like the upcoming BBR module, if the losses in recovery do not indicate congestion then we may continue to raise cwnd multiplicatively in recovery. In such cases the current multiplier will falsely limit the sending rate, much as if it were limited by the application. This commit adds an optional congestion control callback to use a different multiplier to expand the TCP send buffer. For congestion control modules that do not specificy this callback, TCP continues to use the previous default of 2. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 349204130d84..1aa9628ae608 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -917,6 +917,8 @@ struct tcp_congestion_ops { void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* suggest number of segments for each skb to transmit (optional) */ u32 (*tso_segs_goal)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9ed4bb96f74..13a2e70141f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; @@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk) * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ - sndmem = 2 * nr_segs * per_mss; + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); -- cgit v1.2.3 From c0402760f565ae066621ebf8720a32fba074d538 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:21 -0400 Subject: tcp: new CC hook to set sending rate with rate_sample in any CA state This commit introduces an optional new "omnipotent" hook, cong_control(), for congestion control modules. The cong_control() function is called at the end of processing an ACK (i.e., after updating sequence numbers, the SACK scoreboard, and loss detection). At that moment we have precise delivery rate information the congestion control module can use to control the sending behavior (using cwnd, TSO skb size, and pacing rate) in any CA state. This function can also be used by a congestion control that prefers not to use the default cwnd reduction approach (i.e., the PRR algorithm) during CA_Recovery to control the cwnd and sending rate during loss recovery. We take advantage of the fact that recent changes defer the retransmission or transmission of new data (e.g. by F-RTO) in recovery until the new tcp_cong_control() function is run. With this commit, we only run tcp_update_pacing_rate() if the congestion control is not using this new API. New congestion controls which use the new API do not want the TCP stack to run the default pacing rate calculation and overwrite whatever pacing rate they have chosen at initialization time. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++++ net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_input.c | 17 ++++++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1aa9628ae608..f83b7f220a65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -919,6 +919,10 @@ struct tcp_congestion_ops { u32 (*tso_segs_goal)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ + void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 882caa4e72bc..1294af4e0127 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13a2e70141f5..980a83edfa63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (inet_csk(sk)->icsk_ca_ops->cong_control) + return; + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { @@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * information. All transmission or retransmission are delayed afterwards. */ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, - int flag) + int flag, const struct rate_sample *rs) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cong_control) { + icsk->icsk_ca_ops->cong_control(sk, rs); + return; + } + if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, flag); @@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag); + tcp_cong_control(sk, ack, delivered, flag, &rs); tcp_xmit_recovery(sk, rexmit); return 1; @@ -5982,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else tcp_init_metrics(sk); - tcp_update_pacing_rate(sk); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; -- cgit v1.2.3 From 7e744171386ae6da1248d3d27d10b6dbdc54f0ff Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:22 -0400 Subject: tcp: increase ICSK_CA_PRIV_SIZE from 64 bytes to 88 The TCP CUBIC module already uses 64 bytes. The upcoming TCP BBR module uses 88 bytes. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 49dcad4fe99e..197a30d221e9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -134,8 +134,8 @@ struct inet_connection_sock { } icsk_mtup; u32 icsk_user_timeout; - u64 icsk_ca_priv[64 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (8 * sizeof(u64)) + u64 icsk_ca_priv[88 / sizeof(u64)]; +#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64)) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -- cgit v1.2.3 From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:23 -0400 Subject: tcp_bbr: add BBR congestion control This commit implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A detailed description of BBR will be published in ACM Queue, Vol. 14 No. 5, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR has significantly increased throughput and reduced latency for connections on Google's internal backbone networks and google.com and YouTube Web servers. BBR requires only changes on the sender side, not in the network or the receiver side. Thus it can be incrementally deployed on today's Internet, or in datacenters. The Internet has predominantly used loss-based congestion control (largely Reno or CUBIC) since the 1980s, relying on packet loss as the signal to slow down. While this worked well for many years, loss-based congestion control is unfortunately out-dated in today's networks. On today's Internet, loss-based congestion control causes the infamous bufferbloat problem, often causing seconds of needless queuing delay, since it fills the bloated buffers in many last-mile links. On today's high-speed long-haul links using commodity switches with shallow buffers, loss-based congestion control has abysmal throughput because it over-reacts to losses caused by transient traffic bursts. In 1981 Kleinrock and Gale showed that the optimal operating point for a network maximizes delivered bandwidth while minimizing delay and loss, not only for single connections but for the network as a whole. Finding that optimal operating point has been elusive, since any single network measurement is ambiguous: network measurements are the result of both bandwidth and propagation delay, and those two cannot be measured simultaneously. While it is impossible to disambiguate any single bandwidth or RTT measurement, a connection's behavior over time tells a clearer story. BBR uses a measurement strategy designed to resolve this ambiguity. It combines these measurements with a robust servo loop using recent control systems advances to implement a distributed congestion control algorithm that reacts to actual congestion, not packet loss or transient queue delay, and is designed to converge with high probability to a point near the optimal operating point. In a nutshell, BBR creates an explicit model of the network pipe by sequentially probing the bottleneck bandwidth and RTT. On the arrival of each ACK, BBR derives the current delivery rate of the last round trip, and feeds it through a windowed max-filter to estimate the bottleneck bandwidth. Conversely it uses a windowed min-filter to estimate the round trip propagation delay. The max-filtered bandwidth and min-filtered RTT estimates form BBR's model of the network pipe. Using its model, BBR sets control parameters to govern sending behavior. The primary control is the pacing rate: BBR applies a gain multiplier to transmit faster or slower than the observed bottleneck bandwidth. The conventional congestion window (cwnd) is now the secondary control; the cwnd is set to a small multiple of the estimated BDP (bandwidth-delay product) in order to allow full utilization and bandwidth probing while bounding the potential amount of queue at the bottleneck. When a BBR connection starts, it enters STARTUP mode and applies a high gain to perform an exponential search to quickly probe the bottleneck bandwidth (doubling its sending rate each round trip, like slow start). However, instead of continuing until it fills up the buffer (i.e. a loss), or until delay or ACK spacing reaches some threshold (like Hystart), it uses its model of the pipe to estimate when that pipe is full: it estimates the pipe is full when it notices the estimated bandwidth has stopped growing. At that point it exits STARTUP and enters DRAIN mode, where it reduces its pacing rate to drain the queue it estimates it has created. Then BBR enters steady state. In steady state, PROBE_BW mode cycles between first pacing faster to probe for more bandwidth, then pacing slower to drain any queue that created if no more bandwidth was available, and then cruising at the estimated bandwidth to utilize the pipe without creating excess queue. Occasionally, on an as-needed basis, it sends significantly slower to probe for RTT (PROBE_RTT mode). BBR has been fully deployed on Google's wide-area backbone networks and we're experimenting with BBR on Google.com and YouTube on a global scale. Replacing CUBIC with BBR has resulted in significant improvements in network latency and application (RPC, browser, and video) metrics. For more details please refer to our upcoming ACM Queue publication. Example performance results, to illustrate the difference between BBR and CUBIC: Resilience to random loss (e.g. from shallow buffers): Consider a netperf TCP_STREAM test lasting 30 secs on an emulated path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher). Low latency with the bloated buffers common in today's last-mile links: Consider a netperf TCP_STREAM test lasting 120 secs on an emulated path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck buffer. Both fully utilize the bottleneck bandwidth, but BBR achieves this with a median RTT 25x lower (43 ms instead of 1.09 secs). Our long-term goal is to improve the congestion control algorithms used on the Internet. We are hopeful that BBR can help advance the efforts toward this goal, and motivate the community to do further research. Test results, performance evaluations, feedback, and BBR-related discussions are very welcome in the public e-mail list for BBR: https://groups.google.com/forum/#!forum/bbr-dev NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, since pacing is integral to the BBR design and implementation. BBR without pacing would not function properly, and may incur unnecessary high packet loss rates. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 13 + net/ipv4/Kconfig | 18 + net/ipv4/Makefile | 1 + net/ipv4/tcp_bbr.c | 896 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 928 insertions(+) create mode 100644 net/ipv4/tcp_bbr.c (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b5c366f87b3e..509cd961068d 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -124,6 +124,7 @@ enum { INET_DIAG_PEERS, INET_DIAG_PAD, INET_DIAG_MARK, + INET_DIAG_BBRINFO, __INET_DIAG_MAX, }; @@ -157,8 +158,20 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +/* INET_DIAG_BBRINFO */ + +struct tcp_bbr_info { + /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lo; /* lower 32 bits of bw */ + __u32 bbr_bw_hi; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 50d6a9b49f6c..300b06888fdf 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -640,6 +640,21 @@ config TCP_CONG_CDG D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg +config TCP_CONG_BBR + tristate "BBR TCP" + default n + ---help--- + + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -674,6 +689,9 @@ choice config DEFAULT_CDG bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_RENO bool "Reno" endchoice diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9cfff1a0bf71..bc6a6c8b9bcd 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o +obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c new file mode 100644 index 000000000000..0ea66c2c9344 --- /dev/null +++ b/net/ipv4/tcp_bbr.c @@ -0,0 +1,896 @@ +/* Bottleneck Bandwidth and RTT (BBR) congestion control + * + * BBR congestion control computes the sending rate based on the delivery + * rate (throughput) estimated from ACKs. In a nutshell: + * + * On each ACK, update our model of the network path: + * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) + * min_rtt = windowed_min(rtt, 10 seconds) + * pacing_rate = pacing_gain * bottleneck_bandwidth + * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) + * + * The core algorithm does not react directly to packet losses or delays, + * although BBR may adjust the size of next send per ACK when loss is + * observed, or adjust the sending rate if it estimates there is a + * traffic policer, in order to keep the drop rate reasonable. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, + * since pacing is integral to the BBR design and implementation. + * BBR without pacing would not function properly, and may incur unnecessary + * high packet loss rates. + */ +#include +#include +#include +#include +#include +#include + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + restore_cwnd:1, /* decided to revert cwnd to old value */ + round_start:1, /* start of packet-timed tx->ack round? */ + tso_segs_goal:7, /* segments we want in each skb we send */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + unused:5, + lt_is_sampling:1, /* taking long-term ("LT") samples now? */ + lt_rtt_cnt:7, /* round trips in long-term interval */ + lt_use_bw:1; /* use lt_bw as our bw estimate? */ + u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ + u32 lt_last_delivered; /* LT intvl start: tp->delivered */ + u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ + u32 lt_last_lost; /* LT intvl start: tp->lost */ + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_cnt:3, /* number of rounds without large bw gains */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + unused_b:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ +}; + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* Window length of bw filter (in rounds): */ +static const int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec): */ +static const u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ +static const u32 bbr_probe_rtt_mode_ms = 200; +/* Skip TSO below the following bandwidth (bits/sec): */ +static const int bbr_min_tso_rate = 1200000; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ +static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ +static const int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ +static const int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static const u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight: + */ +static const u32 bbr_cwnd_min_target = 4; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available: */ +static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ +static const u32 bbr_full_bw_cnt = 3; + +/* "long-term" ("LT") bandwidth estimator parameters... */ +/* The minimum number of rounds in an LT bw sampling interval: */ +static const u32 bbr_lt_intvl_min_rtts = 4; +/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +static const u32 bbr_lt_loss_thresh = 50; +/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +static const u32 bbr_lt_bw_diff = 4000 / 8; +/* If we estimate we're policed, use lt_bw for this many round trips: */ +static const u32 bbr_lt_bw_max_rtts = 48; + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_cnt >= bbr_full_bw_cnt; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return minmax_get(&bbr->bw); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +{ + rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC; + return rate >> BW_SCALE; +} + +/* Pace using current bw estimate and a gain factor. In order to help drive the + * network toward lower queues while maintaining high utilization and low + * latency, the average pacing rate aims to be slightly (~1%) lower than the + * estimated bandwidth. This is an important aspect of the design. In this + * implementation this slightly lower pacing rate is achieved implicitly by not + * including link-layer headers in the packet size used for the pacing rate. + */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +/* Return count of segments we want in the skbs we send, or 0 for default. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->tso_segs_goal; +} + +static void bbr_set_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 min_segs; + + min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), + 0x7FU); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + } +} + +/* Find target cwnd. Right-size the cwnd based on min RTT and the + * estimated bottleneck bandwidth: + * + * cwnd = bw * min_rtt * gain = BDP * gain + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + * + * To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: TCP_INIT_CWND. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd += 3 * bbr->tso_segs_goal; + + /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ + cwnd = (cwnd + 1) & ~1U; + + return cwnd; +} + +/* An optimization in BBR to reduce losses: On the first round of recovery, we + * follow the packet conservation principle: send P packets per P packets acked. + * After that, we slow-start and send at most 2*P packets per P packets acked. + * After recovery finishes, or upon undo, we restore the cwnd we had when + * recovery started (capped by the target cwnd based on estimated BDP). + * + * TODO(ycheng/ncardwell): implement a rate-based approach. + */ +static bool bbr_set_cwnd_to_recover_or_restore( + struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; + u32 cwnd = tp->snd_cwnd; + + /* An ACK for P pkts should release at most 2*P packets. We do this + * in two steps. First, here we deduct the number of lost packets. + * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. + */ + if (rs->losses > 0) + cwnd = max_t(s32, cwnd - rs->losses, 1); + + if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { + /* Starting 1st round of Recovery, so do packet conservation. */ + bbr->packet_conservation = 1; + bbr->next_rtt_delivered = tp->delivered; /* start round now */ + /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ + cwnd = tcp_packets_in_flight(tp) + acked; + } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { + /* Exiting loss recovery; restore cwnd saved before recovery. */ + bbr->restore_cwnd = 1; + bbr->packet_conservation = 0; + } + bbr->prev_ca_state = state; + + if (bbr->restore_cwnd) { + /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ + cwnd = max(cwnd, bbr->prior_cwnd); + bbr->restore_cwnd = 0; + } + + if (bbr->packet_conservation) { + *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); + return true; /* yes, using packet conservation */ + } + *new_cwnd = cwnd; + return false; +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd = 0, target_cwnd = 0; + + if (!acked) + return; + + if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) + goto done; + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + target_cwnd = bbr_target_cwnd(sk, bw, gain); + if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ + cwnd = min(cwnd + acked, target_cwnd); + else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) + cwnd = cwnd + acked; + cwnd = max(cwnd, bbr_cwnd_min_target); + +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); +} + +/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +static bool bbr_is_next_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = + skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + + /* The pacing_gain of 1.0 paces at the estimated bw to try to fully + * use the pipe without increasing the queue. + */ + if (bbr->pacing_gain == BBR_UNIT) + return is_full_length; /* just use wall clock time */ + + inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + bw = bbr_max_bw(sk); + + /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at + * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is + * small (e.g. on a LAN). We do not persist if packets are lost, since + * a path with small buffers may not hold that much. + */ + if (bbr->pacing_gain > BBR_UNIT) + return is_full_length && + (rs->losses || /* perhaps pacing_gain*BDP won't fit */ + inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + + /* A pacing_gain < 1.0 tries to drain extra queue we added if bw + * probing didn't find more bw. If inflight falls to match BDP then we + * estimate queue is drained; persisting would underutilize the pipe. + */ + return is_full_length || + inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); +} + +static void bbr_advance_cycle_phase(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); + bbr->cycle_mstamp = tp->delivered_mstamp; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; +} + +/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && + bbr_is_next_cycle_phase(sk, rs)) + bbr_advance_cycle_phase(sk); +} + +static void bbr_reset_startup_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_STARTUP; + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; +} + +static void bbr_reset_probe_bw_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_PROBE_BW; + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = bbr_cwnd_gain; + bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +} + +static void bbr_reset_mode(struct sock *sk) +{ + if (!bbr_full_bw_reached(sk)) + bbr_reset_startup_mode(sk); + else + bbr_reset_probe_bw_mode(sk); +} + +/* Start a new long-term sampling interval. */ +static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +} + +/* Completely reset long-term bandwidth sampling. */ +static void bbr_reset_lt_bw_sampling(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_bw = 0; + bbr->lt_use_bw = 0; + bbr->lt_is_sampling = false; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 diff; + + if (bbr->lt_bw) { /* do we have bw from a previous interval? */ + /* Is new bw close to the lt_bw from the previous interval? */ + diff = abs(bw - bbr->lt_bw); + if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || + (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= + bbr_lt_bw_diff)) { + /* All criteria are met; estimate we're policed. */ + bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ + bbr->lt_use_bw = 1; + bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ + bbr->lt_rtt_cnt = 0; + return; + } + } + bbr->lt_bw = bw; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of + * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and + * explicitly models their policed rate, to reduce unnecessary losses. We + * estimate that we're policed if we see 2 consecutive sampling intervals with + * consistent throughput and high packet loss. If we think we're being policed, + * set lt_bw to the "long-term" average delivery rate from those 2 intervals. + */ +static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; + s32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && + ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { + bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ + bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ + } + return; + } + + /* Wait for the first loss before sampling, to let the policer exhaust + * its tokens and estimate the steady-state rate allowed by the policer. + * Starting samples earlier includes bursts that over-estimate the bw. + */ + if (!bbr->lt_is_sampling) { + if (!rs->losses) + return; + bbr_reset_lt_bw_sampling_interval(sk); + bbr->lt_is_sampling = true; + } + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (rs->is_app_limited) { + bbr_reset_lt_bw_sampling(sk); + return; + } + + if (bbr->round_start) + bbr->lt_rtt_cnt++; /* count round trips in this interval */ + if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) + return; /* sampling interval needs to be longer */ + if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { + bbr_reset_lt_bw_sampling(sk); /* interval is too long */ + return; + } + + /* End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (!rs->losses) + return; + + /* Calculate packets lost and delivered in sampling interval. */ + lost = tp->lost - bbr->lt_last_lost; + delivered = tp->delivered - bbr->lt_last_delivered; + /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ + if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) + return; + + /* Find average delivery rate in this sampling interval. */ + t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); + if (t < 1) + return; /* interval is less than one jiffy, so wait */ + t = jiffies_to_usecs(t); + /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ + if (t < 1) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +} + +/* Estimate the bandwidth based on how fast packets are delivered */ +static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->round_start = 0; + if (rs->delivered < 0 || rs->interval_us <= 0) + return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ + if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->rtt_cnt++; + bbr->round_start = 1; + bbr->packet_conservation = 0; + } + + bbr_lt_bw_sampling(sk, rs); + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + */ + bw = (u64)rs->delivered * BW_UNIT; + do_div(bw, rs->interval_us); + + /* If this sample is application-limited, it is likely to have a very + * low delivered count that represents application behavior rather than + * the available network rate. Such a sample could drag down estimated + * bw, causing needless slow-down. Thus, to continue to send at the + * last measured network rate, we filter out app-limited samples unless + * they describe the path bw at least as well as our bw model. + * + * So the goal during app-limited phase is to proceed with the best + * network rate no matter how long. We automatically leave this + * phase when app writes faster than the network can deliver :) + */ + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { + /* Incorporate new sample into our max bw filter. */ + minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + } +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ + bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + filter_expired = after(tcp_time_stamp, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; + bbr->min_rtt_stamp = tcp_time_stamp; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). */ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + bbr->probe_rtt_done_stamp = tcp_time_stamp + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && + after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_time_stamp; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } + } + } + bbr->idle_restart = 0; +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +{ + bbr_update_bw(sk, rs); + bbr_update_cycle_phase(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr_check_drain(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +static void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw; + + bbr_update_model(sk, rs); + + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_tso_segs_goal(sk); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->prior_cwnd = 0; + bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_time_stamp; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + + /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); + sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ + bbr_set_pacing_rate(sk, bw, bbr_high_gain); + + bbr->restore_cwnd = 0; + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp.v64 = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* In theory BBR does not need to undo the cwnd since it does not + * always reduce cwnd on losses (see bbr_main()). Keep it for now. + */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ +static u32 bbr_ssthresh(struct sock *sk) +{ + bbr_save_cwnd(sk); + return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ +} + +static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw(sk); + + bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; + memset(&info->bbr, 0, sizeof(info->bbr)); + info->bbr.bbr_bw_lo = (u32)bw; + info->bbr.bbr_bw_hi = (u32)(bw >> 32); + info->bbr.bbr_min_rtt = bbr->min_rtt_us; + info->bbr.bbr_pacing_gain = bbr->pacing_gain; + info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr); + } + return 0; +} + +static void bbr_set_state(struct sock *sk, u8 new_state) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + bbr->round_start = 1; /* treat RTO like end of a round */ + bbr_lt_bw_sampling(sk, &rs); + } +} + +static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, + .tso_segs_goal = bbr_tso_segs_goal, + .get_info = bbr_get_info, + .set_state = bbr_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -- cgit v1.2.3 From 332ae8e2f6ecda5e50c5c62ed62894963e3a83f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:53 +0100 Subject: net: cls_bpf: add hardware offload This patch adds hardware offload capability to cls_bpf classifier, similar to what have been done with U32 and flower. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/net/pkt_cls.h | 14 ++++++++++ net/sched/cls_bpf.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a10d8d18ce19..69f242c71865 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -789,6 +789,7 @@ enum { TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_MATCHALL, + TC_SETUP_CLSBPF, }; struct tc_cls_u32_offload; @@ -800,6 +801,7 @@ struct tc_to_netdev { struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; struct tc_cls_matchall_offload *cls_mall; + struct tc_cls_bpf_offload *cls_bpf; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a459be5fe1c2..41e8071dff87 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -486,4 +486,18 @@ struct tc_cls_matchall_offload { unsigned long cookie; }; +enum tc_clsbpf_command { + TC_CLSBPF_ADD, + TC_CLSBPF_REPLACE, + TC_CLSBPF_DESTROY, +}; + +struct tc_cls_bpf_offload { + enum tc_clsbpf_command command; + struct tcf_exts *exts; + struct bpf_prog *prog; + const char *name; + bool exts_integrated; +}; + #endif diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c6f7a47541eb..6523c5b4c0a5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -39,6 +39,7 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; + bool offloaded; struct tcf_exts exts; u32 handle; union { @@ -138,6 +139,71 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) return !prog->bpf_ops; } +static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, + enum tc_clsbpf_command cmd) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_bpf_offload bpf_offload = {}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSBPF; + offload.cls_bpf = &bpf_offload; + + bpf_offload.command = cmd; + bpf_offload.exts = &prog->exts; + bpf_offload.prog = prog->filter; + bpf_offload.name = prog->bpf_name; + bpf_offload.exts_integrated = prog->exts_integrated; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); +} + +static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_bpf_prog *obj = prog; + enum tc_clsbpf_command cmd; + + if (oldprog && oldprog->offloaded) { + if (tc_should_offload(dev, tp, 0)) { + cmd = TC_CLSBPF_REPLACE; + } else { + obj = oldprog; + cmd = TC_CLSBPF_DESTROY; + } + } else { + if (!tc_should_offload(dev, tp, 0)) + return; + cmd = TC_CLSBPF_ADD; + } + + if (cls_bpf_offload_cmd(tp, obj, cmd)) + return; + + obj->offloaded = true; + if (oldprog) + oldprog->offloaded = false; +} + +static void cls_bpf_stop_offload(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + int err; + + if (!prog->offloaded) + return; + + err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + if (err) { + pr_err("Stopping hardware offload failed: %d\n", err); + return; + } + + prog->offloaded = false; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -177,6 +243,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -193,6 +260,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(prog, tmp, &head->plist, link) { + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -415,6 +483,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; + cls_bpf_offload(tp, prog, oldprog); + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); -- cgit v1.2.3 From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:54 +0100 Subject: net: cls_bpf: limit hardware offload by software-only flag Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag. Unlike U32 and flower cls_bpf already has some netlink flags defined. Create a new attribute to be able to use the same flag values as the above. Unlike U32 and flower reject unknown flags. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_bpf.c | 22 ++++++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41e8071dff87..57af9f3032ff 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -498,6 +498,7 @@ struct tc_cls_bpf_offload { struct bpf_prog *prog; const char *name; bool exts_integrated; + u32 gen_flags; }; #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8915b61bbf83..8fd715f806a2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -396,6 +396,7 @@ enum { TCA_BPF_FD, TCA_BPF_NAME, TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, __TCA_BPF_MAX, }; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6523c5b4c0a5..ebf01f7c1470 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 +#define CLS_BPF_SUPPORTED_GEN_FLAGS \ + TCA_CLS_FLAGS_SKIP_HW struct cls_bpf_head { struct list_head plist; @@ -40,6 +42,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; bool offloaded; + u32 gen_flags; struct tcf_exts exts; u32 handle; union { @@ -55,6 +58,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, @@ -154,6 +158,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.prog = prog->filter; bpf_offload.name = prog->bpf_name; bpf_offload.exts_integrated = prog->exts_integrated; + bpf_offload.gen_flags = prog->gen_flags; return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); @@ -167,14 +172,14 @@ static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd; if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, tp, 0)) { + if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else { obj = oldprog; cmd = TC_CLSBPF_DESTROY; } } else { - if (!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, tp, prog->gen_flags)) return; cmd = TC_CLSBPF_ADD; } @@ -370,6 +375,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; + u32 gen_flags = 0; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; @@ -394,8 +400,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout; + } + } prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); @@ -568,6 +583,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; + if (prog->gen_flags && + nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); -- cgit v1.2.3 From 58e2af8b3a6b587e4ac8414343581da4349d3c0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:57 +0100 Subject: bpf: expose internal verfier structures Move verifier's internal structures to a header file and prefix their names with bpf_ to avoid potential namespace conflicts. Those structures will soon be used by external analyzers. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 79 +++++++++++++ kernel/bpf/verifier.c | 266 +++++++++++++++++-------------------------- 2 files changed, 182 insertions(+), 163 deletions(-) create mode 100644 include/linux/bpf_verifier.h (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h new file mode 100644 index 000000000000..9457a22fc6e0 --- /dev/null +++ b/include/linux/bpf_verifier.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _LINUX_BPF_VERIFIER_H +#define _LINUX_BPF_VERIFIER_H 1 + +#include /* for enum bpf_reg_type */ +#include /* for MAX_BPF_STACK */ + +struct bpf_reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct bpf_verifier_state { + struct bpf_reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct bpf_verifier_state_list { + struct bpf_verifier_state state; + struct bpf_verifier_state_list *next; +}; + +struct bpf_insn_aux_data { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct bpf_verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ + bool allow_ptr_leaks; + bool seen_direct_write; + struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ +}; + +#endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9542d89f293..dca2b9b1d02e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -126,82 +127,16 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -struct reg_state { - enum bpf_reg_type type; - union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u32 id; - u16 off; - u16 range; - }; - - /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | - * PTR_TO_MAP_VALUE_OR_NULL - */ - struct bpf_map *map_ptr; - }; -}; - -enum bpf_stack_slot_type { - STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ -}; - -#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ - -/* state of the program: - * type of all registers and stack info - */ -struct verifier_state { - struct reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; -}; - -/* linked list of verifier states used to prune search */ -struct verifier_state_list { - struct verifier_state state; - struct verifier_state_list *next; -}; - /* verifier_state + insn_idx are pushed to stack when branch is encountered */ -struct verifier_stack_elem { +struct bpf_verifier_stack_elem { /* verifer state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ - struct verifier_state st; + struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; - struct verifier_stack_elem *next; -}; - -struct bpf_insn_aux_data { - enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ -}; - -#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ - -/* single container for all structs - * one verifier_env per bpf_check() call - */ -struct verifier_env { - struct bpf_prog *prog; /* eBPF program being verified */ - struct verifier_stack_elem *head; /* stack of verifier states to be processed */ - int stack_size; /* number of states to be processed */ - struct verifier_state cur_state; /* current verifier state */ - struct verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ - u32 used_map_cnt; /* number of used maps */ - u32 id_gen; /* used to generate unique reg IDs */ - bool allow_ptr_leaks; - bool seen_direct_write; - struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + struct bpf_verifier_stack_elem *next; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -254,9 +189,9 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_state *state) +static void print_verifier_state(struct bpf_verifier_state *state) { - struct reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; @@ -432,9 +367,9 @@ static void print_bpf_insn(struct bpf_insn *insn) } } -static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; int insn_idx; if (env->head == NULL) @@ -451,12 +386,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) return insn_idx; } -static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, - int prev_insn_idx) +static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; - elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; @@ -482,7 +417,7 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void init_reg_state(struct reg_state *regs) +static void init_reg_state(struct bpf_reg_state *regs) { int i; @@ -498,7 +433,7 @@ static void init_reg_state(struct reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; @@ -511,7 +446,7 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct reg_state *regs, u32 regno, +static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, enum reg_arg_type t) { if (regno >= MAX_BPF_REG) { @@ -571,8 +506,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct verifier_state *state, int off, int size, - int value_regno) +static int check_stack_write(struct bpf_verifier_state *state, int off, + int size, int value_regno) { int i; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -597,7 +532,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, } else { /* regular write of data into stack */ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct reg_state) {}; + (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -605,7 +540,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, return 0; } -static int check_stack_read(struct verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -646,7 +581,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct verifier_env *env, u32 regno, int off, +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -661,7 +596,7 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_access_direct_pkt_data(struct verifier_env *env, +static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta) { switch (env->prog->type) { @@ -678,11 +613,11 @@ static bool may_access_direct_pkt_data(struct verifier_env *env, } } -static int check_packet_access(struct verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; off += reg->off; if (off < 0 || size <= 0 || off + size > reg->range) { @@ -694,7 +629,7 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && @@ -709,7 +644,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct verifier_env *env, int regno) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { if (env->allow_ptr_leaks) return false; @@ -723,12 +658,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } -static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, - int off, int size) +static int check_ptr_alignment(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int off, int size) { if (reg->type != PTR_TO_PACKET) { if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); + verbose("misaligned access off %d size %d\n", + off, size); return -EACCES; } else { return 0; @@ -769,12 +705,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { - struct verifier_state *state = &env->cur_state; - struct reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; if (reg->type == PTR_TO_STACK) @@ -860,9 +796,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -896,12 +832,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, int regno, +static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i; if (regs[regno].type != PTR_TO_STACK) { @@ -940,11 +876,11 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return 0; } -static int check_func_arg(struct verifier_env *env, u32 regno, +static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1149,10 +1085,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -static void clear_all_pkt_pointers(struct verifier_env *env) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) @@ -1172,12 +1108,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) } } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct reg_state *regs = state->regs; - struct reg_state *reg; + struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1280,12 +1216,13 @@ static int check_call(struct verifier_env *env, int func_id) return 0; } -static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +static int check_packet_ptr_add(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; - struct reg_state tmp_reg; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1353,10 +1290,10 @@ add_imm: return 0; } -static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; u8 opcode = BPF_OP(insn->code); s64 imm_log2; @@ -1366,7 +1303,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) */ if (BPF_SRC(insn->code) == BPF_X) { - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && dst_reg->imm && opcode == BPF_ADD) { @@ -1455,11 +1392,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. @@ -1476,9 +1414,9 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1652,10 +1590,10 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct verifier_state *state, - const struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct bpf_verifier_state *state, + struct bpf_reg_state *dst_reg) { - struct reg_state *regs = state->regs, *reg; + struct bpf_reg_state *regs = state->regs, *reg; int i; /* LLVM can generate two kind of checks: @@ -1701,11 +1639,11 @@ static void find_good_pkt_pointers(struct verifier_state *state, } } -static int check_cond_jmp_op(struct verifier_env *env, +static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct verifier_state *other_branch, *this_branch = &env->cur_state; - struct reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1767,7 +1705,7 @@ static int check_cond_jmp_op(struct verifier_env *env, if (!other_branch) return -EFAULT; - /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { @@ -1809,9 +1747,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) } /* verify BPF_LD_IMM64 instruction */ -static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -1866,11 +1804,11 @@ static bool may_access_skb(enum bpf_prog_type type) * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ -static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct reg_state *reg; + struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -1956,7 +1894,7 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ @@ -1967,7 +1905,7 @@ static int *insn_state; * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -2008,7 +1946,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) /* non-recursive depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ -static int check_cfg(struct verifier_env *env) +static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2117,7 +2055,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +static bool compare_ptrs_to_packet(struct bpf_reg_state *old, + struct bpf_reg_state *cur) { if (old->id != cur->id) return false; @@ -2192,9 +2131,10 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +static bool states_equal(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - struct reg_state *rold, *rcur; + struct bpf_reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -2234,9 +2174,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) * the same, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} * but current path has stored: - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ @@ -2247,10 +2187,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) return true; } -static int is_state_visited(struct verifier_env *env, int insn_idx) +static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { - struct verifier_state_list *new_sl; - struct verifier_state_list *sl; + struct bpf_verifier_state_list *new_sl; + struct bpf_verifier_state_list *sl; sl = env->explored_states[insn_idx]; if (!sl) @@ -2274,7 +2214,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); if (!new_sl) return -ENOMEM; @@ -2285,11 +2225,11 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; } -static int do_check(struct verifier_env *env) +static int do_check(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; @@ -2572,7 +2512,7 @@ static int check_map_prog_compatibility(struct bpf_map *map, /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ -static int replace_map_fd_with_map_ptr(struct verifier_env *env) +static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2669,7 +2609,7 @@ next_insn: } /* drop refcnt of maps used by the rejected program */ -static void release_maps(struct verifier_env *env) +static void release_maps(struct bpf_verifier_env *env) { int i; @@ -2678,7 +2618,7 @@ static void release_maps(struct verifier_env *env) } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -static void convert_pseudo_ld_imm64(struct verifier_env *env) +static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2692,7 +2632,7 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ -static int convert_ctx_accesses(struct verifier_env *env) +static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; const int insn_cnt = env->prog->len; @@ -2757,9 +2697,9 @@ static int convert_ctx_accesses(struct verifier_env *env) return 0; } -static void free_states(struct verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl, *sln; int i; if (!env->explored_states) @@ -2782,16 +2722,16 @@ static void free_states(struct verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; - struct verifier_env *env; + struct bpf_verifier_env *env; int ret = -EINVAL; if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; - /* 'struct verifier_env' can be global, but since it's not small, + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -2833,7 +2773,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, - sizeof(struct verifier_state_list *), + sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) -- cgit v1.2.3 From 13a27dfc669724564aafa2699976ee756029fed2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:58 +0100 Subject: bpf: enable non-core use of the verfier Advanced JIT compilers and translators may want to use eBPF verifier as a base for parsers or to perform custom checks and validations. Add ability for external users to invoke the verifier and provide callbacks to be invoked for every intruction checked. For now only add most basic callback for per-instruction pre-interpretation checks is added. More advanced users may also like to have per-instruction post callback and state comparison callback. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 11 +++++++ kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9457a22fc6e0..c5cb661712c9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,12 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifier_env; +struct bpf_ext_analyzer_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -68,6 +74,8 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -76,4 +84,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dca2b9b1d02e..ee86a77dc40b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -632,6 +632,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + /* for analyzer ctx accesses are already validated and converted */ + if (env->analyzer_ops) + return 0; + if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ @@ -2225,6 +2229,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = &env->cur_state; @@ -2283,6 +2296,10 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(insn); } + err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + if (err) + return err; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -2845,3 +2862,54 @@ err_free_env: kfree(env); return ret; } + +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv) +{ + struct bpf_verifier_env *env; + int ret; + + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + prog->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; + env->prog = prog; + env->analyzer_ops = ops; + env->analyzer_priv = priv; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + log_level = 0; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct bpf_verifier_state_list *), + GFP_KERNEL); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); + return ret; +} +EXPORT_SYMBOL_GPL(bpf_analyzer); -- cgit v1.2.3 From 68d640630d4ef2a4bf3f68b5073dec5e4c4f878b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:02 +0100 Subject: net: cls_bpf: allow offloaded filters to update stats Call into offloaded filters to update stats. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_bpf.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 57af9f3032ff..5ccaa4be7d96 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -490,6 +490,7 @@ enum tc_clsbpf_command { TC_CLSBPF_ADD, TC_CLSBPF_REPLACE, TC_CLSBPF_DESTROY, + TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1becc2fe1bc5..bb1d5a487081 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -221,6 +221,15 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, prog->offloaded = false; } +static void cls_bpf_offload_update_stats(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + if (!prog->offloaded) + return; + + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -577,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, tm->tcm_handle = prog->handle; + cls_bpf_offload_update_stats(tp, prog); + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; -- cgit v1.2.3 From cf1a6474f80735ff4a5d99f3dd68a94dbec8455f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:41:53 +0100 Subject: rxrpc: Add per-peer RTT tracker Add a function to track the average RTT for a peer. Sources of RTT data will be added in subsequent patches. The RTT data will be useful in the future for determining resend timeouts and for handling the slow-start part of the Rx protocol. Also add a pair of tracepoints, one to log transmissions to elicit a response for RTT purposes and one to log responses that contribute RTT data. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 61 ++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 25 +++++++++++++++--- net/rxrpc/misc.c | 8 ++++++ net/rxrpc/peer_event.c | 41 +++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 75a5d8bf50e1..e8f2afbbe0bf 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -353,6 +353,67 @@ TRACE_EVENT(rxrpc_recvmsg, __entry->ret) ); +TRACE_EVENT(rxrpc_rtt_tx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, + rxrpc_serial_t send_serial), + + TP_ARGS(call, why, send_serial), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_tx_trace, why ) + __field(rxrpc_serial_t, send_serial ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + ), + + TP_printk("c=%p %s sr=%08x", + __entry->call, + rxrpc_rtt_tx_traces[__entry->why], + __entry->send_serial) + ); + +TRACE_EVENT(rxrpc_rtt_rx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + s64 rtt, u8 nr, s64 avg), + + TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_rx_trace, why ) + __field(u8, nr ) + __field(rxrpc_serial_t, send_serial ) + __field(rxrpc_serial_t, resp_serial ) + __field(s64, rtt ) + __field(u64, avg ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + __entry->resp_serial = resp_serial; + __entry->rtt = rtt; + __entry->nr = nr; + __entry->avg = avg; + ), + + TP_printk("c=%p %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", + __entry->call, + rxrpc_rtt_rx_traces[__entry->why], + __entry->send_serial, + __entry->resp_serial, + __entry->rtt, + __entry->nr, + __entry->avg) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dcf54e3fb478..79c671e552c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -258,10 +258,11 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 - suseconds_t rtt; /* current RTT estimate (in uS) */ - unsigned int rtt_point; /* next entry at which to insert */ - unsigned int rtt_usage; /* amount of cache actually used */ - suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ + u64 rtt; /* Current RTT estimate (in nS) */ + u64 rtt_sum; /* Sum of cache contents */ + u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ + u8 rtt_cursor; /* next entry at which to insert */ + u8 rtt_usage; /* amount of cache actually used */ }; /* @@ -657,6 +658,20 @@ enum rxrpc_recvmsg_trace { extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; +enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_ping, + rxrpc_rtt_tx__nr_trace +}; + +extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; + +enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx__nr_trace +}; + +extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -955,6 +970,8 @@ void rxrpc_reject_packets(struct rxrpc_local *); */ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); /* * peer_object.c diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 026e1f2e83ff..6321c23f9a6e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -182,3 +182,11 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { [rxrpc_recvmsg_to_be_accepted] = "TBAC", [rxrpc_recvmsg_return] = "RETN", }; + +const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { + [rxrpc_rtt_tx_ping] = "PING", +}; + +const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { + [rxrpc_rtt_rx_ping_response] = "PONG", +}; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 18276e7cb9e0..bf13b8470c9a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -305,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work) rxrpc_put_peer(peer); _leave(""); } + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt; + u64 sum = peer->rtt_sum, avg; + u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; + + rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); + if (rtt < 0) + return; + + /* Replace the oldest datum in the RTT buffer */ + sum -= peer->rtt_cache[cursor]; + sum += rtt; + peer->rtt_cache[cursor] = rtt; + peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); + peer->rtt_sum = sum; + if (usage < RXRPC_RTT_CACHE_SIZE) { + usage++; + peer->rtt_usage = usage; + } + + /* Now recalculate the average */ + if (usage == RXRPC_RTT_CACHE_SIZE) { + avg = sum / RXRPC_RTT_CACHE_SIZE; + } else { + avg = sum; + do_div(avg, usage); + } + + peer->rtt = avg; + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, + usage, avg); +} -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6dab3f7457c..9bf60b556bd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.3 From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:10 +0300 Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action TCA_VLAN_ACT_MODIFY allows one to change an existing tag. It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id, priority). If packet is vlan tagged, then the tag gets overwritten according to user specified attributes. For example, this allows user to replace a tag's vid while preserving its priority bits (as opposed to "action vlan pop pipe action vlan push"). Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index be72b6e3843b..bddb272b843f 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 struct tc_vlan { tc_gen; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 59a8d3150ae2..a95c00b119da 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_vlan *v = to_vlan(a); int action; int err; + u16 tci; spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); @@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; break; + case TCA_VLAN_ACT_MODIFY: + /* No-op if no vlan tag (either hw-accel or in-payload) */ + if (!skb_vlan_tagged(skb)) + goto unlock; + /* extract existing tag (and guarantee no hw-accel tag) */ + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + skb->vlan_tci = 0; + } else { + /* in-payload vlan tag, pop it */ + err = __skb_vlan_pop(skb, &tci); + if (err) + goto drop; + } + /* replace the vid */ + tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + /* replace prio bits, if tcfv_push_prio specified */ + if (v->tcfv_push_prio) { + tci &= ~VLAN_PRIO_MASK; + tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + } + /* put updated tci as hwaccel tag */ + __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + break; default: BUG(); } @@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_hash_release(*a, bind); @@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || + v->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto) || -- cgit v1.2.3 From efee95f42b5dddedcaff0a0eaa44e170fc7522e8 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 20 Sep 2016 19:25:58 -0400 Subject: ptp_clock: future-proofing drivers against PTP subsystem becoming optional Drivers must be ready to accept NULL from ptp_clock_register() if the PTP clock subsystem is configured out. This patch documents that and ensures that all drivers cope well with a NULL return. Signed-off-by: Nicolas Pitre Reviewed-by: Eugenia Emantayev Acked-by: Richard Cochran Acked-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e1000e/ptp.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 2 +- drivers/net/ethernet/intel/igb/igb_ptp.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 2 +- drivers/net/ethernet/sfc/ptp.c | 14 +++++++------- drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 2 +- include/linux/ptp_clock_kernel.h | 5 +++++ 9 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 2e1b17ad52a3..ad03763e009a 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -334,7 +334,7 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) if (IS_ERR(adapter->ptp_clock)) { adapter->ptp_clock = NULL; e_err("ptp_clock_register failed\n"); - } else { + } else if (adapter->ptp_clock) { e_info("registered PHC clock\n"); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index ed39cbad24bd..f1feceab758a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -669,7 +669,7 @@ void i40e_ptp_init(struct i40e_pf *pf) pf->ptp_clock = NULL; dev_err(&pf->pdev->dev, "%s: ptp_clock_register failed\n", __func__); - } else { + } else if (pf->ptp_clock) { struct timespec64 ts; u32 regval; diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 66dfa2085cc7..1dd14e166dc8 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -1159,7 +1159,7 @@ void igb_ptp_init(struct igb_adapter *adapter) if (IS_ERR(adapter->ptp_clock)) { adapter->ptp_clock = NULL; dev_err(&adapter->pdev->dev, "ptp_clock_register failed\n"); - } else { + } else if (adapter->ptp_clock) { dev_info(&adapter->pdev->dev, "added PHC on %s\n", adapter->netdev->name); adapter->ptp_flags |= IGB_PTP_ENABLED; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index e5431bfe3339..a92277683a64 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -1254,7 +1254,7 @@ static long ixgbe_ptp_create_clock(struct ixgbe_adapter *adapter) adapter->ptp_clock = NULL; e_dev_err("ptp_clock_register failed\n"); return err; - } else + } else if (adapter->ptp_clock) e_dev_info("registered PHC device on %s\n", netdev->name); /* set default timestamp mode to disabled here. We do this in diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index 1494997c4f7e..08fc5fc56d43 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -298,7 +298,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) if (IS_ERR(mdev->ptp_clock)) { mdev->ptp_clock = NULL; mlx4_err(mdev, "ptp_clock_register failed\n"); - } else { + } else if (mdev->ptp_clock) { mlx4_info(mdev, "registered PHC clock\n"); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 847a8f3ac2b2..13dc388667b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -273,7 +273,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) tstamp->ptp = ptp_clock_register(&tstamp->ptp_info, &priv->mdev->pdev->dev); - if (IS_ERR_OR_NULL(tstamp->ptp)) { + if (IS_ERR(tstamp->ptp)) { mlx5_core_warn(priv->mdev, "ptp_clock_register failed %ld\n", PTR_ERR(tstamp->ptp)); tstamp->ptp = NULL; diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index dd204d9704c6..77a5364f7a10 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -1269,13 +1269,13 @@ int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel) if (IS_ERR(ptp->phc_clock)) { rc = PTR_ERR(ptp->phc_clock); goto fail3; - } - - INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker); - ptp->pps_workwq = create_singlethread_workqueue("sfc_pps"); - if (!ptp->pps_workwq) { - rc = -ENOMEM; - goto fail4; + } else if (ptp->phc_clock) { + INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker); + ptp->pps_workwq = create_singlethread_workqueue("sfc_pps"); + if (!ptp->pps_workwq) { + rc = -ENOMEM; + goto fail4; + } } } ptp->nic_ts_enabled = false; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 170a18b61281..6e3b82972ce8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -187,7 +187,7 @@ int stmmac_ptp_register(struct stmmac_priv *priv) if (IS_ERR(priv->ptp_clock)) { priv->ptp_clock = NULL; pr_err("ptp_clock_register() failed on %s\n", priv->dev->name); - } else + } else if (priv->ptp_clock) pr_debug("Added PTP HW clock successfully on %s\n", priv->dev->name); diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 6b15e168148a..5ad54fc66cf0 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -127,6 +127,11 @@ struct ptp_clock; * * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. + * + * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * support is missing at the configuration level, this function + * returns NULL, and drivers are expected to gracefully handle that + * case separately. */ extern struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, -- cgit v1.2.3 From e2f036a97271cf5811ee754bf321a29a814577f9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:55 -0300 Subject: sctp: rename WORD_TRUNC/ROUND macros To something more meaningful these days, specially because this is working on packet headers or lengths and which are not tied to any CPU arch but to the protocol itself. So, WORD_TRUNC becomes SCTP_TRUNC4 and WORD_ROUND becomes SCTP_PAD4. Reported-by: David Laight Reported-by: David Miller Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 10 +++++----- net/netfilter/xt_sctp.c | 2 +- net/sctp/associola.c | 2 +- net/sctp/chunk.c | 6 +++--- net/sctp/input.c | 8 ++++---- net/sctp/inqueue.c | 2 +- net/sctp/output.c | 12 ++++++------ net/sctp/sm_make_chunk.c | 28 ++++++++++++++-------------- net/sctp/sm_statefuns.c | 6 +++--- net/sctp/transport.c | 4 ++-- net/sctp/ulpevent.c | 4 ++-- 11 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 632e205ca54b..87a7f42e7639 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -83,9 +83,9 @@ #endif /* Round an int up to the next multiple of 4. */ -#define WORD_ROUND(s) (((s)+3)&~3) +#define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ -#define WORD_TRUNC(s) ((s)&~3) +#define SCTP_TRUNC4(s) ((s)&~3) /* * Function declarations. @@ -433,7 +433,7 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); - frag = WORD_TRUNC(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); + frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); return frag; } @@ -462,7 +462,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) for (pos.v = chunk->member;\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ - pos.v += WORD_ROUND(ntohs(pos.p->length))) + pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) @@ -472,7 +472,7 @@ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(sctp_chunkhdr_t));\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ - err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length)))) + err = (sctp_errhdr_t *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ef36a56a02c6..4dedb96d1a06 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += WORD_ROUND(ntohs(sch->length)); + offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1c23060c41a6..f10d3397f917 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(sk, t, - WORD_TRUNC(dst_mtu(t->dst))); + SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index af9cc8055465..76eae828ec89 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -208,8 +208,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + - hmac_desc->hmac_len); + max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) + + hmac_desc->hmac_len); } /* Now, check if we need to reduce our max */ @@ -229,7 +229,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max) - max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..a1d85065bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, - WORD_TRUNC(info)); + SCTP_TRUNC4(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = offset + WORD_ROUND(ntohs(ch->length)); + ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; @@ -1121,7 +1121,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; @@ -1190,7 +1190,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) + if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 6437aa97cfd7..f731de3e8428 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -213,7 +213,7 @@ new_skb: } chunk->chunk_hdr = ch; - chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 0c605ec74dc4..2a5c1896d18f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -297,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { sctp_xmit_t retval = SCTP_XMIT_OK; - __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -508,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (gso) { pkt_size = packet->overhead; list_for_each_entry(chunk, &packet->chunk_list, list) { - int padded = WORD_ROUND(chunk->skb->len); + int padded = SCTP_PAD4(chunk->skb->len); if (pkt_size + padded > tp->pathmtu) break; @@ -538,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * included in the chunk length field. The sender should * never pad with more than 3 bytes. * - * [This whole comment explains WORD_ROUND() below.] + * [This whole comment explains SCTP_PAD4() below.] */ pkt_size -= packet->overhead; @@ -560,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) has_data = 1; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) memset(skb_put(chunk->skb, padding), 0, padding); @@ -587,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * acknowledged or have failed. * Re-queue auth chunks if needed. */ - pkt_size -= WORD_ROUND(chunk->skb->len); + pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); @@ -911,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, */ maxsize = pmtu - packet->overhead; if (packet->auth) - maxsize -= WORD_ROUND(packet->auth->skb->len); + maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..79dd66079dd7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; - chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (asoc->prsctp_enable) @@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -1390,7 +1390,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); + skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1482,7 +1482,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) void *target; void *padding; int chunklen = ntohs(chunk->chunk_hdr->length); - int padlen = WORD_ROUND(chunklen) - chunklen; + int padlen = SCTP_PAD4(chunklen) - chunklen; padding = skb_put(chunk->skb, padlen); target = skb_put(chunk->skb, len); @@ -1900,7 +1900,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc, struct __sctp_missing report; __u16 len; - len = WORD_ROUND(sizeof(report)); + len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. @@ -2098,9 +2098,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, if (*errp) { if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length)))) + SCTP_PAD4(ntohs(param.p->length)))) sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), + SCTP_PAD4(ntohs(param.p->length)), param.v); } else { /* If there is no memory for generating the ERROR diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d88bb2b0b699..026e3bca4a94 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, } /* Report violation if chunk len overflows */ - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 81b86678be4d..ce54dce13ddb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d85b803da11d..bea00058ce35 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, ch = (sctp_errhdr_t *)(chunk->skb->data); cause = ch->cause; - elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); @@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * MUST ignore the padding bytes. */ len = ntohs(chunk->chunk_hdr->length); - padding = WORD_ROUND(len) - len; + padding = SCTP_PAD4(len) - len; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); -- cgit v1.2.3 From 77f2efcbdd7133466060198e02c6e8a170c3cd14 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:01 +0100 Subject: rxrpc: Add ktime_sub_ms() Add a ktime_sub_ms() to go with ktime_add_ms() and co. for use in AF_RXRPC RTT determination. Signed-off-by: David Howells --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..aa118bad1407 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -231,6 +231,11 @@ static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } +static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) +{ + return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); +} + extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** -- cgit v1.2.3 From 572de608e36279f249c9a6350f142e69f23dacab Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 22 Sep 2016 10:33:54 +0800 Subject: net: ethernet: mediatek: add extension of phy-mode for TRGMII adds PHY-mode "trgmii" as an extension for the operation mode of the PHY interface for PHY_INTERFACE_MODE_TRGMII. and adds a variable trgmii inside mtk_mac as the indication to make the difference between the MAC connected to internal switch or connected to external PHY by the given configuration on the board and then to perform the corresponding setup on TRGMII hardware module. Signed-off-by: Sean Wang Cc: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 ++ drivers/net/ethernet/mediatek/mtk_eth_soc.h | 3 +++ include/linux/phy.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 2909372c4da0..e873e21fd20e 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -244,6 +244,8 @@ static int mtk_phy_connect(struct mtk_mac *mac) return -ENODEV; switch (of_get_phy_mode(np)) { + case PHY_INTERFACE_MODE_TRGMII: + mac->trgmii = true; case PHY_INTERFACE_MODE_RGMII_TXID: case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_ID: diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 7c5e534d2120..e3b9525f94c6 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -529,6 +529,8 @@ struct mtk_eth { * @hw: Backpointer to our main datastruture * @hw_stats: Packet statistics counter * @phy_dev: The attached PHY if available + * @trgmii Indicate if the MAC uses TRGMII connected to internal + switch */ struct mtk_mac { int id; @@ -539,6 +541,7 @@ struct mtk_mac { struct phy_device *phy_dev; __be32 hwlro_ip[MTK_MAX_LRO_IP_CNT]; int hwlro_ip_cnt; + bool trgmii; }; /* the struct describing the SoC. these are declared in the soc_xyz.c files */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d24b283aa2d..e25f1830fbcf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -80,6 +80,7 @@ typedef enum { PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, + PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -123,6 +124,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "moca"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; + case PHY_INTERFACE_MODE_TRGMII: + return "trgmii"; default: return "unknown"; } -- cgit v1.2.3 From 2b03bf732488a3c2e920afe22c03b82cb8477e28 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 13 Sep 2016 13:49:53 +0200 Subject: netfilter: nft_numgen: add number generation offset Add support of an offset value for incremental counter and random. With this option the sysadmin is able to start the counter to a certain value and then apply the generated number. Example: meta mark set numgen inc mod 2 offset 100 This will generate marks with the serie 100, 101, 100, 101, ... Suggested-by: Pablo Neira Ayuso Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_numgen.c | 32 ++++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index bc0eb6a1066d..bcfb892ff148 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1136,12 +1136,14 @@ enum nft_trace_types { * @NFTA_NG_DREG: destination register (NLA_U32) * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) + * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, NFTA_NG_DREG, NFTA_NG_MODULUS, NFTA_NG_TYPE, + NFTA_NG_OFFSET, __NFTA_NG_MAX }; #define NFTA_NG_MAX (__NFTA_NG_MAX - 1) diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index f173ebec30a7..55bc5ab78d4a 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -23,6 +23,7 @@ struct nft_ng_inc { enum nft_registers dreg:8; u32 modulus; atomic_t counter; + u32 offset; }; static void nft_ng_inc_eval(const struct nft_expr *expr, @@ -37,13 +38,14 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); - regs->data[priv->dreg] = nval; + regs->data[priv->dreg] = nval + priv->offset; } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, + [NFTA_NG_OFFSET] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -52,10 +54,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, { struct nft_ng_inc *priv = nft_expr_priv(expr); + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); atomic_set(&priv->counter, 0); @@ -64,7 +72,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, - u32 modulus, enum nft_ng_types type) + u32 modulus, enum nft_ng_types type, u32 offset) { if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) goto nla_put_failure; @@ -72,6 +80,8 @@ static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset))) + goto nla_put_failure; return 0; @@ -83,12 +93,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL, + priv->offset); } struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; + u32 offset; }; static void nft_ng_random_eval(const struct nft_expr *expr, @@ -97,9 +109,10 @@ static void nft_ng_random_eval(const struct nft_expr *expr, { struct nft_ng_random *priv = nft_expr_priv(expr); struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); + u32 val; - regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state), - priv->modulus); + val = reciprocal_scale(prandom_u32_state(state), priv->modulus); + regs->data[priv->dreg] = val + priv->offset; } static int nft_ng_random_init(const struct nft_ctx *ctx, @@ -108,10 +121,16 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, { struct nft_ng_random *priv = nft_expr_priv(expr); + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + prandom_init_once(&nft_numgen_prandom_state); priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); @@ -124,7 +143,8 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM, + priv->offset); } static struct nft_expr_type nft_ng_type; -- cgit v1.2.3 From 36b701fae12ac763a568037e4e7c96b5727a8b3e Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Wed, 14 Sep 2016 15:00:02 +0200 Subject: netfilter: nf_tables: validate maximum value of u32 netlink attributes Fetch value and validate u32 netlink attribute. This validation is usually required when the u32 netlink attributes are being stored in a field whose size is smaller. This patch revisits 4da449ae1df9 ("netfilter: nft_exthdr: Add size check on u8 nft_exthdr attributes"). Fixes: 96518518cc41 ("netfilter: add nftables") Suggested-by: Pablo Neira Ayuso Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 25 +++++++++++++++++++++++++ net/netfilter/nft_bitwise.c | 8 +++++++- net/netfilter/nft_byteorder.c | 15 +++++++++++++-- net/netfilter/nft_cmp.c | 3 +++ net/netfilter/nft_exthdr.c | 12 +++++++----- net/netfilter/nft_immediate.c | 4 ++++ 7 files changed, 60 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a7a7cebc8d07..5031e072567b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -145,6 +145,7 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } +unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bd9715e5ff26..b70d3ea1430e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4409,6 +4409,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, return 0; } +/** + * nft_parse_u32_check - fetch u32 attribute and check for maximum value + * + * @attr: netlink attribute to fetch value from + * @max: maximum value to be stored in dest + * @dest: pointer to the variable + * + * Parse, check and store a given u32 netlink attribute into variable. + * This function returns -ERANGE if the value goes over maximum value. + * Otherwise a 0 is returned and the attribute value is stored in the + * destination variable. + */ +unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) +{ + int val; + + val = ntohl(nla_get_be32(attr)); + if (val > max) + return -ERANGE; + + *dest = val; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_u32_check); + /** * nft_parse_register - parse a register value from a netlink attribute * diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index d71cc18fa35d..31c15ed2e5fc 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, { struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_data_desc d1, d2; + u32 len; int err; if (tb[NFTA_BITWISE_SREG] == NULL || @@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_XOR] == NULL) return -EINVAL; - priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index b78c28ba465f..ee63d981268d 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); + u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || @@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); + if (err < 0) + return err; + + priv->size = size; + switch (priv->size) { case 2: case 4: @@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, } priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); - priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index e25b35d70e4d..2e53739812b1 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.len > U8_MAX) + return -ERANGE; + priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); priv->len = desc.len; return 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 82c264e40278..a84cf3d66056 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, err; if (tb[NFTA_EXTHDR_DREG] == NULL || tb[NFTA_EXTHDR_TYPE] == NULL || @@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, tb[NFTA_EXTHDR_LEN] == NULL) return -EINVAL; - offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; - if (offset > U8_MAX || len > U8_MAX) - return -ERANGE; + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index db3b746858e3..d17018ff54e6 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; + + if (desc.len > U8_MAX) + return -ERANGE; + priv->dlen = desc.len; priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); -- cgit v1.2.3 From 8061bb54436c19fd16b7c734a69ff60bac26e3e9 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 14 Sep 2016 23:41:46 +0800 Subject: netfilter: nft_queue: add _SREG_QNUM attr to select the queue number Currently, the user can specify the queue numbers by _QUEUE_NUM and _QUEUE_TOTAL attributes, this is enough in most situations. But acctually, it is not very flexible, for example: tcp dport 80 mapped to queue0 tcp dport 81 mapped to queue1 tcp dport 82 mapped to queue2 In order to do this thing, we must add 3 nft rules, and more mapping meant more rules ... So take one register to select the queue number, then we can add one simple rule to mapping queues, maybe like this: queue num tcp dport map { 80:0, 81:1, 82:2 ... } Florian Westphal also proposed wider usage scenarios: queue num jhash ip saddr . ip daddr mod ... queue num meta cpu ... queue num meta mark ... The last point is how to load a queue number from sreg, although we can use *(u16*)®s->data[reg] to load the queue number, just like nat expr to load its l4port do. But we will cooperate with hash expr, meta cpu, meta mark expr and so on. They all store the result to u32 type, so cast it to u16 pointer and dereference it will generate wrong result in the big endian system. So just keep it simple, we treat queue number as u32 type, although u16 type is already enough. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nft_queue.c | 102 +++++++++++++++++++++++++++---- 2 files changed, 92 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index bcfb892ff148..1cf41dd838b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -894,12 +894,14 @@ enum nft_log_attributes { * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16) * @NFTA_QUEUE_TOTAL: number of queues to load balance packets on (NLA_U16) * @NFTA_QUEUE_FLAGS: various flags (NLA_U16) + * @NFTA_QUEUE_SREG_QNUM: source register of queue number (NLA_U32: nft_registers) */ enum nft_queue_attributes { NFTA_QUEUE_UNSPEC, NFTA_QUEUE_NUM, NFTA_QUEUE_TOTAL, NFTA_QUEUE_FLAGS, + NFTA_QUEUE_SREG_QNUM, __NFTA_QUEUE_MAX }; #define NFTA_QUEUE_MAX (__NFTA_QUEUE_MAX - 1) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index d16d59959ff6..393d359a1889 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -22,9 +22,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - u16 queuenum; - u16 queues_total; - u16 flags; + enum nft_registers sreg_qnum:8; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -54,26 +55,39 @@ static void nft_queue_eval(const struct nft_expr *expr, regs->verdict.code = ret; } +static void nft_queue_sreg_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue, ret; + + queue = regs->data[priv->sreg_qnum]; + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + regs->verdict.code = ret; +} + static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, + [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 }, }; static int nft_queue_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); u32 maxid; - if (tb[NFTA_QUEUE_NUM] == NULL) - return -EINVAL; - - init_hashrandom(&jhash_initval); priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); - if (tb[NFTA_QUEUE_TOTAL] != NULL) + if (tb[NFTA_QUEUE_TOTAL]) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); else priv->queues_total = 1; @@ -85,11 +99,34 @@ static int nft_queue_init(const struct nft_ctx *ctx, if (maxid > U16_MAX) return -ERANGE; - if (tb[NFTA_QUEUE_FLAGS] != NULL) { + if (tb[NFTA_QUEUE_FLAGS]) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_sreg_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + int err; + + priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); + err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + if (err < 0) + return err; + + if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) + return -EOPNOTSUPP; } + return 0; } @@ -108,6 +145,21 @@ nla_put_failure: return -1; } +static int +nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_queue_type; static const struct nft_expr_ops nft_queue_ops = { .type = &nft_queue_type, @@ -117,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = { .dump = nft_queue_dump, }; +static const struct nft_expr_ops nft_queue_sreg_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_sreg_eval, + .init = nft_queue_sreg_init, + .dump = nft_queue_sreg_dump, +}; + +static const struct nft_expr_ops * +nft_queue_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM]) + return ERR_PTR(-EINVAL); + + init_hashrandom(&jhash_initval); + + if (tb[NFTA_QUEUE_NUM]) + return &nft_queue_ops; + + if (tb[NFTA_QUEUE_SREG_QNUM]) + return &nft_queue_sreg_ops; + + return ERR_PTR(-EINVAL); +} + static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", - .ops = &nft_queue_ops, + .select_ops = &nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = NFTA_QUEUE_MAX, .owner = THIS_MODULE, -- cgit v1.2.3 From 2462f3f4a7e079192b78f36900c34f18dad824a7 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Thu, 15 Sep 2016 20:50:16 +0800 Subject: netfilter: nf_queue: improve queue range support for bridge family After commit ac2863445686 ("netfilter: bridge: add nf_afinfo to enable queuing to userspace"), we can queue packets to the user space in bridge family. But when the user specify the queue range, packets will be only delivered to the first queue num. Because in nfqueue_hash, we only support ipv4 and ipv6 family. Now add support for bridge family too. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 56 ++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index cc8a11f7e306..903dca050fb6 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -41,22 +41,19 @@ static inline void init_hashrandom(u32 *jhash_initval) *jhash_initval = prandom_u32(); } -static inline u32 hash_v4(const struct sk_buff *skb, u32 jhash_initval) +static inline u32 hash_v4(const struct iphdr *iph, u32 initval) { - const struct iphdr *iph = ip_hdr(skb); - /* packets in either direction go into same queue */ if ((__force u32)iph->saddr < (__force u32)iph->daddr) return jhash_3words((__force u32)iph->saddr, - (__force u32)iph->daddr, iph->protocol, jhash_initval); + (__force u32)iph->daddr, iph->protocol, initval); return jhash_3words((__force u32)iph->daddr, - (__force u32)iph->saddr, iph->protocol, jhash_initval); + (__force u32)iph->saddr, iph->protocol, initval); } -static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval) +static inline u32 hash_v6(const struct ipv6hdr *ip6h, u32 initval) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); u32 a, b, c; if ((__force u32)ip6h->saddr.s6_addr32[3] < @@ -74,17 +71,50 @@ static inline u32 hash_v6(const struct sk_buff *skb, u32 jhash_initval) else c = (__force u32) ip6h->daddr.s6_addr32[1]; - return jhash_3words(a, b, c, jhash_initval); + return jhash_3words(a, b, c, initval); +} + +static inline u32 hash_bridge(const struct sk_buff *skb, u32 initval) +{ + struct ipv6hdr *ip6h, _ip6h; + struct iphdr *iph, _iph; + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + iph = skb_header_pointer(skb, skb_network_offset(skb), + sizeof(*iph), &_iph); + if (iph) + return hash_v4(iph, initval); + break; + case htons(ETH_P_IPV6): + ip6h = skb_header_pointer(skb, skb_network_offset(skb), + sizeof(*ip6h), &_ip6h); + if (ip6h) + return hash_v6(ip6h, initval); + break; + } + + return 0; } static inline u32 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, - u32 jhash_initval) + u32 initval) { - if (family == NFPROTO_IPV4) - queue += ((u64) hash_v4(skb, jhash_initval) * queues_total) >> 32; - else if (family == NFPROTO_IPV6) - queue += ((u64) hash_v6(skb, jhash_initval) * queues_total) >> 32; + switch (family) { + case NFPROTO_IPV4: + queue += reciprocal_scale(hash_v4(ip_hdr(skb), initval), + queues_total); + break; + case NFPROTO_IPV6: + queue += reciprocal_scale(hash_v6(ipv6_hdr(skb), initval), + queues_total); + break; + case NFPROTO_BRIDGE: + queue += reciprocal_scale(hash_bridge(skb, initval), + queues_total); + break; + } return queue; } -- cgit v1.2.3 From 182691d0998400f35ad304718024e60feaa864aa Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 20 Sep 2016 18:19:14 -0300 Subject: sctp: improve how SSN, TSN and ASCONF serial are compared Make it similar to time_before() macros: - easier to understand - make use of typecheck() to avoid working on unexpected variable types (made the issue on previous patch visible) - for _[lg]te versions, slighly faster, as the compiler used to generate a sequence of cmp/je/cmp/js instructions and now it's sub/test/jle (for _lte): Before, for sctp_outq_sack: if (primary->cacc.changeover_active) { 1f01: 80 b9 84 02 00 00 00 cmpb $0x0,0x284(%rcx) 1f08: 74 6e je 1f78 u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { 1f0a: 8b 81 80 02 00 00 mov 0x280(%rcx),%eax return ((s) - (t)) & TSN_SIGN_BIT; } static inline int TSN_lte(__u32 s, __u32 t) { return ((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT); 1f10: 8b 7d bc mov -0x44(%rbp),%edi 1f13: 39 c7 cmp %eax,%edi 1f15: 74 25 je 1f3c 1f17: 39 f8 cmp %edi,%eax 1f19: 78 21 js 1f3c primary->cacc.changeover_active = 0; After: if (primary->cacc.changeover_active) { 1ee7: 80 b9 84 02 00 00 00 cmpb $0x0,0x284(%rcx) 1eee: 74 73 je 1f63 u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { 1ef0: 8b 81 80 02 00 00 mov 0x280(%rcx),%eax 1ef6: 2b 45 b4 sub -0x4c(%rbp),%eax 1ef9: 85 c0 test %eax,%eax 1efb: 7e 26 jle 1f23 primary->cacc.changeover_active = 0; *_lt() generated pretty much the same code. Tested with gcc (GCC) 6.1.1 20160621. This patch also removes SSN_lte as it is not used and cleanups some comments. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 94 ++++++++++----------------------------------------- 1 file changed, 18 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index bafe2a0ab908..ca6c971dd74a 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -307,85 +307,27 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) } /* Compare two TSNs */ +#define TSN_lt(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((a) - (b)) < 0)) -/* RFC 1982 - Serial Number Arithmetic - * - * 2. Comparison - * Then, s1 is said to be equal to s2 if and only if i1 is equal to i2, - * in all other cases, s1 is not equal to s2. - * - * s1 is said to be less than s2 if, and only if, s1 is not equal to s2, - * and - * - * (i1 < i2 and i2 - i1 < 2^(SERIAL_BITS - 1)) or - * (i1 > i2 and i1 - i2 > 2^(SERIAL_BITS - 1)) - * - * s1 is said to be greater than s2 if, and only if, s1 is not equal to - * s2, and - * - * (i1 < i2 and i2 - i1 > 2^(SERIAL_BITS - 1)) or - * (i1 > i2 and i1 - i2 < 2^(SERIAL_BITS - 1)) - */ - -/* - * RFC 2960 - * 1.6 Serial Number Arithmetic - * - * Comparisons and arithmetic on TSNs in this document SHOULD use Serial - * Number Arithmetic as defined in [RFC1982] where SERIAL_BITS = 32. - */ - -enum { - TSN_SIGN_BIT = (1<<31) -}; - -static inline int TSN_lt(__u32 s, __u32 t) -{ - return ((s) - (t)) & TSN_SIGN_BIT; -} - -static inline int TSN_lte(__u32 s, __u32 t) -{ - return ((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT); -} +#define TSN_lte(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((a) - (b)) <= 0)) /* Compare two SSNs */ - -/* - * RFC 2960 - * 1.6 Serial Number Arithmetic - * - * Comparisons and arithmetic on Stream Sequence Numbers in this document - * SHOULD use Serial Number Arithmetic as defined in [RFC1982] where - * SERIAL_BITS = 16. - */ -enum { - SSN_SIGN_BIT = (1<<15) -}; - -static inline int SSN_lt(__u16 s, __u16 t) -{ - return ((s) - (t)) & SSN_SIGN_BIT; -} - -static inline int SSN_lte(__u16 s, __u16 t) -{ - return ((s) == (t)) || (((s) - (t)) & SSN_SIGN_BIT); -} - -/* - * ADDIP 3.1.1 - * The valid range of Serial Number is from 0 to 4294967295 (2**32 - 1). Serial - * Numbers wrap back to 0 after reaching 4294967295. - */ -enum { - ADDIP_SERIAL_SIGN_BIT = (1<<31) -}; - -static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t) -{ - return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT); -} +#define SSN_lt(a,b) \ + (typecheck(__u16, a) && \ + typecheck(__u16, b) && \ + ((__s16)((a) - (b)) < 0)) + +/* ADDIP 3.1.1 */ +#define ADDIP_SERIAL_gte(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((b) - (a)) <= 0)) /* Check VTAG of the packet matches the sender's own tag. */ static inline int -- cgit v1.2.3 From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 08:58:55 -0700 Subject: net_sched: sch_fq: account for schedule/timers drifts It looks like the following patch can make FQ very precise, even in VM or stressed hosts. It matters at high pacing rates. We take into account the difference between the time that was programmed when last packet was sent, and current time (a drift of tens of usecs is often observed) Add an EWMA of the unthrottle latency to help diagnostics. This latency is the difference between current time and oldest packet in delayed RB-tree. This accounts for the high resolution timer latency, but can be different under stress, as fq_check_throttled() can be opportunistically be called from a dequeue() called after an enqueue() for a different flow. Tested: // Start a 10Gbit flow $ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr & Before patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17106.04 756876.84 1102.75 1119049.02 0.00 0.00 0.52 After patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17867.00 800245.90 1151.77 1183172.12 0.00 0.00 0.52 A new iproute2 tc can output the 'unthrottle latency' : $ tc -s qd sh dev eth0 | grep latency 0 gc, 0 highprio, 32490767 throttled, 2382 ns latency Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_fq.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f8e39dbaa781..df7451d35131 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -811,7 +811,7 @@ struct tc_fq_qd_stats { __u32 flows; __u32 inactive_flows; __u32 throttled_flows; - __u32 pad; + __u32 unthrottle_latency_ns; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 5dd929cc1423..18e752439f6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { struct fq_flow *f = container_of(p, struct fq_flow, rate_node); @@ -515,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us. + */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; - st.pad = 0; - + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From 53e89941ba2a969c483aa29b907de9a823179297 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 22 Sep 2016 20:01:41 +0300 Subject: net_sched: act_vlan: add helper inlines to access tcf_vlan info Needed e.g for offloading drivers to pick the relevant attributes. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 6b835889ea30..48cca321ee6c 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -11,6 +11,7 @@ #define __NET_TC_VLAN_H #include +#include #define VLAN_F_POP 0x1 #define VLAN_F_PUSH 0x2 @@ -24,4 +25,28 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct tcf_vlan *)a) +static inline bool is_tcf_vlan(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_VLAN) + return true; +#endif + return false; +} + +static inline u32 tcf_vlan_action(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_action; +} + +static inline u16 tcf_vlan_push_vid(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_push_vid; +} + +static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_push_proto; +} + #endif /* __NET_TC_VLAN_H */ -- cgit v1.2.3 From 732f794c1baf58e1eb2be4431635829c3da655bd Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:22 -0400 Subject: net: dsa: add port fast ageing Today the DSA drivers are in charge of flushing the MAC addresses associated to a port when its STP state changes from Learning or Forwarding, to Disabled or Blocking or Listening. This makes the drivers more complex and hides the generic switch logic. Introduce a new optional port_fast_age operation to dsa_switch_ops, to move this logic to the DSA layer and keep drivers simple. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ net/dsa/slave.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 7556646db2d3..b122196d5a1f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -143,6 +143,7 @@ struct dsa_port { struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; + u8 stp_state; }; struct dsa_switch { @@ -339,6 +340,7 @@ struct dsa_switch_ops { void (*port_bridge_leave)(struct dsa_switch *ds, int port); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + void (*port_fast_age)(struct dsa_switch *ds, int port); /* * VLAN support diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fd78d4c9b197..6b1282c006b1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -71,8 +71,26 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) { + struct dsa_port *dp = &ds->ports[port]; + if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; } static int dsa_slave_open(struct net_device *dev) -- cgit v1.2.3 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e07432b9f8b8..f09c70b97eca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -419,6 +419,13 @@ enum bpf_func_id { */ BPF_FUNC_csum_update, + /** + * bpf_set_hash_invalid(skb) + * Invalidate current skb>hash. + * @skb: pointer to skb + */ + BPF_FUNC_set_hash_invalid, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index acf84fbfb043..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From fc7ab6d29a3af0b7f6df7c095509378c8caf85b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:22:36 +0100 Subject: rxrpc: Add a tracepoint for the call timer Add a tracepoint to log call timer initiation, setting and expiry. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 36 ++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 13 ++++++++++++- net/rxrpc/call_event.c | 7 ++++--- net/rxrpc/call_object.c | 6 ++++-- net/rxrpc/misc.c | 8 ++++++++ net/rxrpc/sendmsg.c | 2 +- 6 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e8f2afbbe0bf..57322897d745 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -414,6 +414,42 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->avg) ); +TRACE_EVENT(rxrpc_timer, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why, + unsigned long now), + + TP_ARGS(call, why, now), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_timer_trace, why ) + __field(unsigned long, now ) + __field(unsigned long, expire_at ) + __field(unsigned long, ack_at ) + __field(unsigned long, resend_at ) + __field(unsigned long, timer ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->now = now; + __entry->expire_at = call->expire_at; + __entry->ack_at = call->ack_at; + __entry->resend_at = call->resend_at; + __entry->timer = call->timer.expires; + ), + + TP_printk("c=%p %s now=%lx x=%ld a=%ld r=%ld t=%ld", + __entry->call, + rxrpc_timer_traces[__entry->why], + __entry->now, + __entry->expire_at - __entry->now, + __entry->ack_at - __entry->now, + __entry->resend_at - __entry->now, + __entry->timer - __entry->now) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a494d56eb236..e564eca75985 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -678,6 +678,17 @@ enum rxrpc_rtt_rx_trace { extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; +enum rxrpc_timer_trace { + rxrpc_timer_begin, + rxrpc_timer_expired, + rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_resend, + rxrpc_timer_set_for_send, + rxrpc_timer__nr_trace +}; + +extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -707,7 +718,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8bc5c8e37ab4..90e970ba048a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) { unsigned long t, now = jiffies; @@ -45,6 +45,7 @@ void rxrpc_set_timer(struct rxrpc_call *call) if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); + trace_rxrpc_timer(call, why, now); } } @@ -120,7 +121,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); if (time_before(ack_at, call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } } @@ -293,7 +294,7 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f2fadf667e19..a53f4c2c0025 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -76,8 +76,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); rxrpc_queue_call(call); + } } /* @@ -200,7 +202,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->ack_at = expire_at; call->resend_at = expire_at; call->timer.expires = expire_at + 1; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_begin); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fe648711c2f7..fa9942fabdf2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -194,3 +194,11 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", [rxrpc_rtt_rx_requested_ack] = "RACK", }; + +const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { + [rxrpc_timer_begin] = "Begin ", + [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_send] = "SetTx ", + [rxrpc_timer_set_for_resend] = "SetRTx", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 93e6584cd751..99939372b5a4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -153,7 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (time_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_send); } } -- cgit v1.2.3 From be832aecc5ba811728e15a10f675f4a2187f25dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint Add a tracepoint to log transmission of DATA packets (including loss injection). Adjust the ACK transmission tracepoint to include the packet serial number and to line this up with the DATA transmission display. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 50 +++++++++++++++++++++++++++++++++++++------- net/rxrpc/conn_event.c | 5 ++--- net/rxrpc/output.c | 5 ++++- 3 files changed, 48 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 57322897d745..6001bf93dc79 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -256,33 +256,67 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_data, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, + rxrpc_serial_t serial, u8 flags, bool lose), + + TP_ARGS(call, seq, serial, flags, lose), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_serial_t, serial ) + __field(u8, flags ) + __field(bool, lose ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->serial = serial; + __entry->flags = flags; + __entry->lose = lose; + ), + + TP_printk("c=%p DATA %08x q=%08x fl=%02x%s", + __entry->call, + __entry->serial, + __entry->seq, + __entry->flags, + __entry->lose ? " *LOSE*" : "") + ); + TRACE_EVENT(rxrpc_tx_ack, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, - rxrpc_serial_t serial, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, + u8 reason, u8 n_acks), - TP_ARGS(call, first, serial, reason, n_acks), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) - __field(rxrpc_seq_t, first ) __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, ack_first ) + __field(rxrpc_serial_t, ack_serial ) __field(u8, reason ) __field(u8, n_acks ) ), TP_fast_assign( __entry->call = call; - __entry->first = first; __entry->serial = serial; + __entry->ack_first = ack_first; + __entry->ack_serial = ack_serial; __entry->reason = reason; __entry->n_acks = n_acks; ), - TP_printk("c=%p %s f=%08x r=%08x n=%u", + TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), - __entry->first, __entry->serial, + rxrpc_acks(__entry->reason), + __entry->ack_first, + __entry->ack_serial, __entry->n_acks) ); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 75a15a4c74c3..a1cf1ec5f29e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,9 +98,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); - - trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, - RXRPC_ACK_DUPLICATE, 0); break; } @@ -122,6 +119,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); _proto("Tx ACK %%%u [re]", serial); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5c1e008a5323..e47fbd1c836d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -177,7 +177,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.serial = htonl(serial); switch (type) { case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(call, + trace_rxrpc_tx_ack(call, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); @@ -275,6 +275,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, + whdr.flags, true); rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); _leave(" = 0 [lose]"); return 0; @@ -302,6 +304,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; -- cgit v1.2.3 From 89b475abdb107a74f57497b65becaf837a0e5b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add a tracepoint to log injected Rx packet loss Add a tracepoint to log received packets that get discarded due to Rx packet loss. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 21 +++++++++++++++++++++ net/rxrpc/input.c | 11 +++++------ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6001bf93dc79..9413b17ba04b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -484,6 +484,27 @@ TRACE_EVENT(rxrpc_timer, __entry->timer - __entry->now) ); +TRACE_EVENT(rxrpc_rx_lose, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fb3e2f6afa3b..19b1e189f5dc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,20 +837,19 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); return; } } - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", -- cgit v1.2.3 From 9c7ad434441da6b5d4ac878cac368fbdaec99b56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:50:40 +0100 Subject: rxrpc: Add tracepoint for ACK proposal Add a tracepoint to log proposed ACKs, including whether the proposal is used to update a pending ACK or is discarded in favour of an easlier, higher priority ACK. Whilst we're at it, get rid of the rxrpc_acks() function and access the name array directly. We do, however, need to validate the ACK reason number given to trace_rxrpc_rx_ack() to make sure we don't overrun the array. Signed-off-by: David Howells --- include/rxrpc/packet.h | 1 + include/trace/events/rxrpc.h | 42 ++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/ar-internal.h | 25 +++++++++++++++++++++++-- net/rxrpc/call_event.c | 21 ++++++++++++++------- net/rxrpc/input.c | 19 +++++++++++++------ net/rxrpc/misc.c | 30 +++++++++++++++++++----------- net/rxrpc/output.c | 3 ++- net/rxrpc/recvmsg.c | 3 ++- 8 files changed, 114 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index fd6eb3a60a8c..703a64b4681a 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -123,6 +123,7 @@ struct rxrpc_ackpacket { #define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ #define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ #define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ uint8_t nAcks; /* number of ACKs */ #define RXRPC_MAXACKS 255 diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9413b17ba04b..d67a8c6b085a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -251,7 +251,7 @@ TRACE_EVENT(rxrpc_rx_ack, TP_printk("c=%p %s f=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->first, __entry->n_acks) ); @@ -314,7 +314,7 @@ TRACE_EVENT(rxrpc_tx_ack, TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, __entry->serial, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->ack_first, __entry->ack_serial, __entry->n_acks) @@ -505,6 +505,44 @@ TRACE_EVENT(rxrpc_rx_lose, __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") ); +TRACE_EVENT(rxrpc_propose_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial, bool immediate, + bool background, enum rxrpc_propose_ack_outcome outcome), + + TP_ARGS(call, why, ack_reason, serial, immediate, background, + outcome), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + __field(bool, immediate ) + __field(bool, background ) + __field(enum rxrpc_propose_ack_outcome, outcome ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + __entry->immediate = immediate; + __entry->background = background; + __entry->outcome = outcome; + ), + + TP_printk("c=%p %s %s r=%08x i=%u b=%u%s", + __entry->call, + rxrpc_propose_ack_traces[__entry->why], + rxrpc_ack_names[__entry->ack_reason], + __entry->serial, + __entry->immediate, + __entry->background, + rxrpc_propose_ack_outcomes[__entry->outcome]) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e564eca75985..042dbcc52654 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -689,8 +689,28 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; +enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_respond_to_ack, + rxrpc_propose_ack_respond_to_ping, + rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_terminal_ack, + rxrpc_propose_ack__nr_trace +}; + +enum rxrpc_propose_ack_outcome { + rxrpc_propose_ack_use, + rxrpc_propose_ack_update, + rxrpc_propose_ack_subsume, + rxrpc_propose_ack__nr_outcomes +}; + +extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; +extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; + extern const char *const rxrpc_pkts[]; -extern const char *rxrpc_acks(u8 reason); +extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; #include @@ -719,7 +739,8 @@ int rxrpc_reject_call(struct rxrpc_sock *); * call_event.c */ void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 90e970ba048a..fd5b11339ffb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -58,14 +58,13 @@ out: */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u16 skew, u32 serial, bool immediate, - bool background) + bool background, + enum rxrpc_propose_ack_trace why) { + enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -74,15 +73,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); if (ack_reason == call->ackr_reason) { if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; call->ackr_skew = skew; } if (!immediate) - return; + goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; call->ackr_skew = skew; + } else { + outcome = rxrpc_propose_ack_subsume; } switch (ack_reason) { @@ -124,17 +126,22 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } + +trace: + trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, + background, outcome); } /* * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, bool background) + u16 skew, u32 serial, bool immediate, bool background, + enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); __rxrpc_propose_ACK(call, ack_reason, skew, serial, - immediate, background); + immediate, background, why); spin_unlock_bh(&call->lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 19b1e189f5dc..349698d87ad1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -49,7 +49,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, if (call->peer->rtt_usage < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true); + true, true, + rxrpc_propose_ack_ping_for_params); } /* @@ -382,7 +383,8 @@ skip: ack: if (ack) rxrpc_propose_ACK(call, ack, skew, ack_serial, - immediate_ack, true); + immediate_ack, true, + rxrpc_propose_ack_input_data); if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) rxrpc_notify_socket(call); @@ -539,6 +541,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { + u8 ack_reason; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -561,8 +564,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? + buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -570,7 +575,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_acks(buf.ack.reason), + rxrpc_ack_names[ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -583,10 +588,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ack); } offset = sp->offset + nr_acks + 3; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fa9942fabdf2..1ca14835d87f 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -91,17 +91,10 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_PING] = 9, }; -const char *rxrpc_acks(u8 reason) -{ - static const char *const str[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" - }; - - if (reason >= ARRAY_SIZE(str)) - reason = ARRAY_SIZE(str) - 1; - return str[reason]; -} +const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" +}; const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_rx_cleaned] = "Rx CLN", @@ -202,3 +195,18 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", }; + +const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", + [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", + [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_terminal_ack] = "ClTerm ", +}; + +const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { + [rxrpc_propose_ack_use] = "", + [rxrpc_propose_ack_update] = " Update", + [rxrpc_propose_ack_subsume] = " Subsume", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e47fbd1c836d..0c563e325c9d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -210,7 +210,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true); + true, true, + rxrpc_propose_ack_retry_tx); break; case RXRPC_PACKET_TYPE_ABORT: break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 99e4c0ae30f1..8c7f3de45bac 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -141,7 +141,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } -- cgit v1.2.3 From c6672e3fe4a641bf302d6309ab4d5ee55648e758 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:58:55 +0100 Subject: rxrpc: Add a tracepoint to log which packets will be retransmitted Add a tracepoint to log in rxrpc_resend() which packets will be retransmitted. Note that if a positive ACK comes in whilst we have dropped the lock to retransmit another packet, the actual retransmission may not happen, though some of the effects will (such as altering the congestion management). Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 27 +++++++++++++++++++++++++++ net/rxrpc/call_event.c | 2 ++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d67a8c6b085a..56475497043d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -543,6 +543,33 @@ TRACE_EVENT(rxrpc_propose_ack, rxrpc_propose_ack_outcomes[__entry->outcome]) ); +TRACE_EVENT(rxrpc_retransmit, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, + s64 expiry), + + TP_ARGS(call, seq, annotation, expiry), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(u8, annotation ) + __field(s64, expiry ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->annotation = annotation; + __entry->expiry = expiry; + ), + + TP_printk("c=%p q=%x a=%02x xp=%lld", + __entry->call, + __entry->seq, + __entry->annotation, + __entry->expiry) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fd5b11339ffb..a78a92fe5d77 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -196,6 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + trace_rxrpc_retransmit(call, seq, annotation | anno_type, + ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); -- cgit v1.2.3 From 7c3d21c8153c6bfb5690e35e086b0522c42442d9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:13 +0300 Subject: net/mlx4_core: Preparation for VF vlan protocol 802.1ad Check device capability to support VF vlan protocol 802.1ad mode. Add vport attribute vlan protocol. Init vport vlan protocol by default to 802.1Q. Add update QP support for VF vlan protocol 802.1ad. Add func capability vlan_offload_disable to disable all vlan HW acceleration on VF while the VF is set to VF vlan protocol 802.1ad mode. No change in VF vlan protocol 802.1Q (VST) mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 7 ++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 37 ++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 ++ .../net/ethernet/mellanox/mlx4/resource_tracker.c | 40 ++++++++++++++++++---- include/linux/mlx4/device.h | 3 ++ include/linux/mlx4/qp.h | 2 ++ 6 files changed, 78 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index a58d96cf1ed1..09c969420eac 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1851,6 +1851,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, if (vp_oper->state.default_vlan == vp_admin->default_vlan && vp_oper->state.default_qos == vp_admin->default_qos && + vp_oper->state.vlan_proto == vp_admin->vlan_proto && vp_oper->state.link_state == vp_admin->link_state && vp_oper->state.qos_vport == vp_admin->qos_vport) return 0; @@ -1909,6 +1910,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, vp_oper->state.default_vlan = vp_admin->default_vlan; vp_oper->state.default_qos = vp_admin->default_qos; + vp_oper->state.vlan_proto = vp_admin->vlan_proto; vp_oper->state.link_state = vp_admin->link_state; vp_oper->state.qos_vport = vp_admin->qos_vport; @@ -1922,6 +1924,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, work->qos_vport = vp_oper->state.qos_vport; work->vlan_id = vp_oper->state.default_vlan; work->vlan_ix = vp_oper->vlan_idx; + work->vlan_proto = vp_oper->state.vlan_proto; work->priv = priv; INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler); queue_work(priv->mfunc.master.comm_wq, &work->work); @@ -2012,6 +2015,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) vp_admin->default_vlan, &(vp_oper->vlan_idx)); if (err) { vp_oper->vlan_idx = NO_INDX; + vp_oper->state.default_vlan = MLX4_VGT; + vp_oper->state.vlan_proto = htons(ETH_P_8021Q); mlx4_warn(&priv->dev, "No vlan resources slave %d, port %d\n", slave, port); @@ -2388,6 +2393,8 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) admin_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; + admin_vport->vlan_proto = htons(ETH_P_8021Q); + oper_vport->vlan_proto = htons(ETH_P_8021Q); vf_oper->vport[port].vlan_idx = NO_INDX; vf_oper->vport[port].mac_idx = NO_INDX; mlx4_set_random_admin_guid(dev, i, port); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index c7523307b41a..7dc9d38a51f8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -158,7 +158,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", [33] = "RoCEv2 support", - [34] = "DMFS Sniffer support (UC & MC)" + [34] = "DMFS Sniffer support (UC & MC)", + [35] = "QinQ VST mode support", }; int i; @@ -313,12 +314,15 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) #define QUERY_FUNC_CAP_PHV_BIT 0x40 +#define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 if (vhcr->op_modifier == 1) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); int converted_port = mlx4_slave_convert_port( dev, slave, vhcr->in_modifier); + struct mlx4_vport_oper_state *vp_oper = + &priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier]; if (converted_port < 0) return -EINVAL; @@ -357,11 +361,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); - if (dev->caps.phv_bit[port]) { - field = QUERY_FUNC_CAP_PHV_BIT; - MLX4_PUT(outbox->buf, field, - QUERY_FUNC_CAP_FLAGS0_OFFSET); - } + field = 0; + if (dev->caps.phv_bit[port]) + field |= QUERY_FUNC_CAP_PHV_BIT; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + field |= QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE; + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); } else if (vhcr->op_modifier == 0) { struct mlx4_active_ports actv_ports = @@ -689,6 +694,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 #define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET 0x5D #define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 @@ -856,6 +862,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); dev_cap->max_sq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); dev_cap->max_qp_per_mcg = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); @@ -2937,6 +2946,22 @@ int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val) } EXPORT_SYMBOL(set_phv_bit); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled) +{ + struct mlx4_func_cap func_cap; + int err; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap); + if (!err) + *vlan_offload_disabled = + !!(func_cap.flags0 & + QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE); + return err; +} +EXPORT_SYMBOL(mlx4_get_is_vlan_offload_disabled); + void mlx4_replace_zero_macs(struct mlx4_dev *dev) { int i; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index c128ba3ef014..fdfe1ace07d7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -508,6 +508,7 @@ struct mlx4_vport_state { u64 mac; u16 default_vlan; u8 default_qos; + __be16 vlan_proto; u32 tx_rate; bool spoofchk; u32 link_state; @@ -657,6 +658,7 @@ struct mlx4_vf_immed_vlan_work { u8 qos_vport; u16 vlan_id; u16 orig_vlan_id; + __be16 vlan_proto; }; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 8b81114bdc72..84d7857ccc27 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -790,10 +790,22 @@ static int update_vport_qp_param(struct mlx4_dev *dev, MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; } else if (0 != vp_oper->state.default_vlan) { - qpc->pri_path.vlan_control |= - MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | - MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | - MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) { + /* vst QinQ should block untagged on TX, + * but cvlan is in payload and phv is set so + * hw see it as untagged. Block tagged instead. + */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } else { /* vst 802.1Q */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } } else { /* priority tagged */ qpc->pri_path.vlan_control |= MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | @@ -802,7 +814,11 @@ static int update_vport_qp_param(struct mlx4_dev *dev, qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN; qpc->pri_path.vlan_index = vp_oper->vlan_idx; - qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qpc->pri_path.fl |= MLX4_FL_ETH_HIDE_CQE_VLAN; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + qpc->pri_path.fl |= MLX4_FL_SV; + else + qpc->pri_path.fl |= MLX4_FL_CV; qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; qpc->pri_path.sched_queue &= 0xC7; qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; @@ -5238,6 +5254,7 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) | (1ULL << MLX4_UPD_QP_PATH_MASK_CV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SV) | (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) | (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) | @@ -5266,7 +5283,12 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) else if (!work->vlan_id) vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; - else + else if (work->vlan_proto == htons(ETH_P_8021AD)) + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + else /* vst 802.1Q */ vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; @@ -5311,7 +5333,11 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.fl = - qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qp->pri_path_fl | MLX4_FL_ETH_HIDE_CQE_VLAN; + if (work->vlan_proto == htons(ETH_P_8021AD)) + upd_context->qp_context.pri_path.fl |= MLX4_FL_SV; + else + upd_context->qp_context.pri_path.fl |= MLX4_FL_CV; upd_context->qp_context.pri_path.feup = qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.sched_queue = diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 42da3552f7cb..59b50d3eedb4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -221,6 +221,7 @@ enum { MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, + MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, }; enum { @@ -1371,6 +1372,8 @@ int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled); int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index deaa2217214d..b4ee8f62ce8d 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -160,6 +160,7 @@ struct mlx4_qp_path { enum { /* fl */ MLX4_FL_CV = 1 << 6, + MLX4_FL_SV = 1 << 5, MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2, MLX4_FL_ETH_SRC_CHECK_MC_LB = 1 << 1, MLX4_FL_ETH_SRC_CHECK_UC_LB = 1 << 0, @@ -267,6 +268,7 @@ enum { MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB = 18 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB = 19 + 32, + MLX4_UPD_QP_PATH_MASK_SV = 22 + 32, }; enum { /* param3 */ -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 9 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 6 +- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 6 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 4 +- drivers/net/ethernet/intel/igb/igb_main.c | 9 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 5 +- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 6 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 2 +- .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 5 +- drivers/net/ethernet/sfc/sriov.c | 5 +- drivers/net/ethernet/sfc/sriov.h | 2 +- include/linux/if_link.h | 1 + include/linux/netdevice.h | 6 +- include/uapi/linux/if_link.h | 19 ++++- net/core/rtnetlink.c | 80 ++++++++++++++++++---- 23 files changed, 161 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 0e68fadecfdb..243cb9748d35 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -492,7 +492,8 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto, int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac); -int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); +int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 6c586b045d1d..3f77d0863543 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2521,7 +2521,8 @@ void bnx2x_pf_set_vfs_vlan(struct bnx2x *bp) for_each_vf(bp, vfidx) { bulletin = BP_VF_BULLETIN(bp, vfidx); if (bulletin->valid_bitmap & (1 << VLAN_VALID)) - bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0); + bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0, + htons(ETH_P_8021Q)); } } @@ -2781,7 +2782,8 @@ static int bnx2x_set_vf_vlan_filter(struct bnx2x *bp, struct bnx2x_virtf *vf, return 0; } -int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) +int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos, + __be16 vlan_proto) { struct pf_vf_bulletin_content *bulletin = NULL; struct bnx2x *bp = netdev_priv(dev); @@ -2796,6 +2798,9 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP(BNX2X_MSG_IOV, "configuring VF %d with VLAN %d qos %d\n", vfidx, vlan, 0); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 8be718508600..ec6cd18842c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -174,7 +174,8 @@ int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac) return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); } -int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) +int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) { struct hwrm_func_cfg_input req = {0}; struct bnxt *bp = netdev_priv(dev); @@ -185,6 +186,9 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) if (bp->hwrm_spec_code < 0x10201) return -ENOTSUPP; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + rc = bnxt_vf_ndo_prep(bp, vf_id); if (rc) return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 0392670ab49c..1ab72e4820af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -12,7 +12,7 @@ int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *); int bnxt_set_vf_mac(struct net_device *, int, u8 *); -int bnxt_set_vf_vlan(struct net_device *, int, u16, u8); +int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 9a94840c5757..ac513e6627d1 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1895,7 +1895,8 @@ static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) return 0; } -static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct be_adapter *adapter = netdev_priv(netdev); struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; @@ -1907,6 +1908,9 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan || qos) { vlan |= qos << VLAN_PRIO_SHIFT; status = be_set_vf_tvt(adapter, vf, vlan); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 67ff01aeb11a..4d19e46f7c55 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -507,7 +507,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs); s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); int fm10k_ndo_set_vf_vlan(struct net_device *netdev, - int vf_idx, u16 vid, u8 qos); + int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, int unused); int fm10k_ndo_get_vf_config(struct net_device *netdev, diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index d9dec81f6b6d..5f4dac0d36ef 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -445,7 +445,7 @@ int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac) } int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, - u8 qos) + u8 qos, __be16 vlan_proto) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_iov_data *iov_data = interface->iov_data; @@ -460,6 +460,10 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, if (qos || (vid > (VLAN_VID_MASK - 1))) return -EINVAL; + /* VF VLAN Protocol part to default is unsupported */ + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + vf_info = &iov_data->vf_info[vf_idx]; /* exit if there is nothing to do */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index da3423561b3a..724d8740d4cc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2747,11 +2747,12 @@ error_param: * @vf_id: VF identifier * @vlan_id: mac address * @qos: priority setting + * @vlan_proto: vlan protocol * * program VF vlan id and/or qos **/ -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos) +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto) { u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT); struct i40e_netdev_priv *np = netdev_priv(netdev); @@ -2774,6 +2775,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, goto error_pvid; } + if (vlan_proto != htons(ETH_P_8021Q)) { + dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); + ret = -EPROTONOSUPPORT; + goto error_pvid; + } + vf = &(pf->vf[vf_id]); vsi = pf->vsi[vf->lan_vsi_idx]; if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 875174141451..4012d069939a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -129,8 +129,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf); /* VF configuration related iplink handlers */ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos); +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto); int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, int max_tx_rate); int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index af75eac5fa16..a83aa13a5bf4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -169,7 +169,7 @@ static int igb_set_vf_mac(struct igb_adapter *, int, unsigned char *); static void igb_restore_vf_multicasts(struct igb_adapter *adapter); static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos); + int vf, u16 vlan, u8 qos, __be16 vlan_proto); static int igb_ndo_set_vf_bw(struct net_device *, int, int, int); static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); @@ -6222,14 +6222,17 @@ static int igb_disable_port_vlan(struct igb_adapter *adapter, int vf) return 0; } -static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos) +static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) { struct igb_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->vfs_allocated_count) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return (vlan || qos) ? igb_enable_port_vlan(adapter, vf, vlan, qos) : igb_disable_port_vlan(adapter, vf); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8618599dfd6f..b18590a995db 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1354,13 +1354,16 @@ static int ixgbe_disable_port_vlan(struct ixgbe_adapter *adapter, int vf) return err; } -int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, + u8 qos, __be16 vlan_proto) { int err = 0; struct ixgbe_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; if (vlan || qos) { /* Check if there is already a port VLAN set, if so * we have to delete the old one first before we diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 47e65e2f886a..0c7977d27b71 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -43,7 +43,7 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int ixgbe_link_mbps(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index a94f8a3f026c..132eeeafcdc4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2400,11 +2400,15 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); } -static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c12792314be7..b58cfe37dead 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2917,11 +2917,15 @@ static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); } -static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, vlan, qos); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cd23a2946db7..0e198fe89d1a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -100,7 +100,8 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev, static void qede_link_update(void *dev, struct qed_link_output *link); #ifdef CONFIG_QED_SRIOV -static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) +static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct qede_dev *edev = netdev_priv(ndev); @@ -109,6 +110,9 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n", vlan, vf); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h index 24061b9b92e8..5f327659efa7 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -238,7 +238,7 @@ int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); int qlcnic_sriov_get_vf_config(struct net_device *, int , struct ifla_vf_info *); -int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8); +int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool); #else static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index afd687e5e779..50eaafa3eaba 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1915,7 +1915,7 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, } int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, - u16 vlan, u8 qos) + u16 vlan, u8 qos, __be16 vlan_proto) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_sriov *sriov = adapter->ahw->sriov; @@ -1928,6 +1928,9 @@ int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, if (vf >= sriov->num_vfs || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan > MAX_VLAN_ID) { netdev_err(netdev, "Invalid VLAN ID, allowed range is [0 - %d]\n", diff --git a/drivers/net/ethernet/sfc/sriov.c b/drivers/net/ethernet/sfc/sriov.c index 816c44689e67..9abcf4aded30 100644 --- a/drivers/net/ethernet/sfc/sriov.c +++ b/drivers/net/ethernet/sfc/sriov.c @@ -22,7 +22,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac) } int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos) + u8 qos, __be16 vlan_proto) { struct efx_nic *efx = netdev_priv(net_dev); @@ -31,6 +31,9 @@ int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT))) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return efx->type->sriov_set_vf_vlan(efx, vf_i, vlan, qos); } else { return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/sriov.h b/drivers/net/ethernet/sfc/sriov.h index 400df526586d..ba1762e7f216 100644 --- a/drivers/net/ethernet/sfc/sriov.h +++ b/drivers/net/ethernet/sfc/sriov.h @@ -16,7 +16,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac); int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i, bool spoofchk); int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f923d15b432c..0b17c585b5cd 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -25,5 +25,6 @@ struct ifla_vf_info { __u32 max_tx_rate; __u32 rss_query_en; __u32 trusted; + __be16 vlan_proto; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69f242c71865..1e8a5c734d72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -946,7 +946,8 @@ struct netdev_xdp { * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); - * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); + * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, + * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); @@ -1187,7 +1188,8 @@ struct net_device_ops { int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, - int queue, u16 vlan, u8 qos); + int queue, u16 vlan, + u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ec9e99d5491..b4fba662cd32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -619,7 +619,7 @@ enum { enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ - IFLA_VF_VLAN, + IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ @@ -631,6 +631,7 @@ enum { IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ __IFLA_VF_MAX, }; @@ -647,6 +648,22 @@ struct ifla_vf_vlan { __u32 qos; }; +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0dbae4244a89..3ac8946bf244 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + @@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } -- cgit v1.2.3 From b42959dc35a533a531dd698b581193a65a5da831 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:16 +0300 Subject: net/mlx4: Add VF vlan protocol 802.1ad support Move the vf to VST 802.1ad mode (mlx4 VST QinQ mode) by setting vf vlan protocol to 802.1ad. VST 802.1ad mode in mlx4, is used for STAG strip/insertion by PF, while the CTAG is set by the VF. Read current vlan protocol as part of the vf configuration state. Upon setting vf vlan protocol to 802.1ad, we use a mechanism of handshake to verify that both the vf and the pf driver version support it. The handshake uses the command QUERY_FUNC_CAP: - The vf sets a pre-defined support bit in input modifier. - A pf that supports the feature sends the request to the vf through a pre-defined field in the output mailbox. - In case vf does not support the feature, the pf will fail the control command (in this case, IP link tool command to set the vf vlan protocol to 802.1ad). No change in VST 802.1Q mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 51 ++++++++++++++- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx4/fw.c | 89 ++++++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1 + include/linux/mlx4/cmd.h | 3 +- 5 files changed, 138 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 09c969420eac..b1cef7a0f7ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1995,6 +1995,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) int port, err; struct mlx4_vport_state *vp_admin; struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; struct mlx4_active_ports actv_ports = mlx4_get_active_ports( &priv->dev, slave); int min_port = find_first_bit(actv_ports.ports, @@ -2009,7 +2011,19 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) priv->mfunc.master.vf_admin[slave].enable_smi[port]; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; - vp_oper->state = *vp_admin; + if (vp_admin->vlan_proto != htons(ETH_P_8021AD) || + slave_state->vst_qinq_supported) { + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + } + vp_oper->state.link_state = vp_admin->link_state; + vp_oper->state.mac = vp_admin->mac; + vp_oper->state.spoofchk = vp_admin->spoofchk; + vp_oper->state.tx_rate = vp_admin->tx_rate; + vp_oper->state.qos_vport = vp_admin->qos_vport; + vp_oper->state.guid = vp_admin->guid; + if (MLX4_VGT != vp_admin->default_vlan) { err = __mlx4_register_vlan(&priv->dev, port, vp_admin->default_vlan, &(vp_oper->vlan_idx)); @@ -2097,6 +2111,7 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, mlx4_warn(dev, "Received reset from slave:%d\n", slave); slave_state[slave].active = false; slave_state[slave].old_vlan_api = false; + slave_state[slave].vst_qinq_supported = false; mlx4_master_deactivate_admin_state(priv, slave); for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { slave_state[slave].event_eq[i].eqn = -1; @@ -2364,6 +2379,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) vf_oper = &priv->mfunc.master.vf_oper[i]; s_state = &priv->mfunc.master.slave_state[i]; s_state->last_cmd = MLX4_COMM_CMD_RESET; + s_state->vst_qinq_supported = false; mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]); for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) s_state->event_eq[j].eqn = -1; @@ -2955,10 +2971,13 @@ int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac) EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos, + __be16 proto) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_vport_state *vf_admin; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_oper_state *vf_oper; int slave; if ((!mlx4_is_master(dev)) || @@ -2968,12 +2987,31 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) if ((vlan > 4095) || (qos > 7)) return -EINVAL; + if (proto == htons(ETH_P_8021AD) && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP)) + return -EPROTONOSUPPORT; + + if (proto != htons(ETH_P_8021Q) && + proto != htons(ETH_P_8021AD)) + return -EINVAL; + + if ((proto == htons(ETH_P_8021AD)) && + ((vlan == 0) || (vlan == MLX4_VGT))) + return -EINVAL; + slave = mlx4_get_slave_indx(dev, vf); if (slave < 0) return -EINVAL; + slave_state = &priv->mfunc.master.slave_state[slave]; + if ((proto == htons(ETH_P_8021AD)) && (slave_state->active) && + (!slave_state->vst_qinq_supported)) { + mlx4_err(dev, "vf %d does not support VST QinQ mode\n", vf); + return -EPROTONOSUPPORT; + } port = mlx4_slaves_closest_port(dev, slave, port); vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (!mlx4_valid_vf_state_change(dev, port, vf_admin, vlan, qos)) return -EPERM; @@ -2983,6 +3021,7 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) else vf_admin->default_vlan = vlan; vf_admin->default_qos = qos; + vf_admin->vlan_proto = proto; /* If rate was configured prior to VST, we saved the configured rate * in vf_admin->rate and now, if priority supported we enforce the QoS @@ -2991,7 +3030,12 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) vf_admin->tx_rate) vf_admin->qos_vport = slave; - if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + /* Try to activate new vf state without restart, + * this option is not supported while moving to VST QinQ mode. + */ + if ((proto == htons(ETH_P_8021AD) && + vf_oper->state.vlan_proto != proto) || + mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) mlx4_info(dev, "updating vf %d port %d config will take effect on next VF restart\n", vf, port); @@ -3135,6 +3179,7 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in ivf->vlan = s_info->default_vlan; ivf->qos = s_info->default_qos; + ivf->vlan_proto = s_info->vlan_proto; if (mlx4_is_vf_vst_and_prio_qos(dev, port, s_info)) ivf->max_tx_rate = s_info->tx_rate; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 132eeeafcdc4..7e703bed7b82 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2406,10 +2406,8 @@ static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; - if (vlan_proto != htons(ETH_P_8021Q)) - return -EPROTONOSUPPORT; - - return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos, + vlan_proto); } static int mlx4_en_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 7dc9d38a51f8..090bf81076e8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -249,6 +249,72 @@ out: return err; } +static int mlx4_activate_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_admin->default_vlan != vp_oper->state.default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &vp_oper->vlan_idx); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + + return 0; +} + +static int mlx4_handle_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + slave_state = &priv->mfunc.master.slave_state[slave]; + + if ((vp_admin->vlan_proto != htons(ETH_P_8021AD)) || + (!slave_state->active)) + return 0; + + if (vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos) + return 0; + + if (!slave_state->vst_qinq_supported) { + /* Warn and revert the request to set vst QinQ mode */ + vp_admin->vlan_proto = vp_oper->state.vlan_proto; + vp_admin->default_vlan = vp_oper->state.default_vlan; + vp_admin->default_qos = vp_oper->state.default_qos; + + mlx4_warn(&priv->dev, + "Slave %d does not support VST QinQ mode\n", slave); + return 0; + } + + err = mlx4_activate_vst_qinq(priv, slave, port); + return err; +} + int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -312,17 +378,18 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 -#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) #define QUERY_FUNC_CAP_PHV_BIT 0x40 #define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 +#define QUERY_FUNC_CAP_SUPPORTS_VST_QINQ BIT(30) +#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS BIT(31) + if (vhcr->op_modifier == 1) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); int converted_port = mlx4_slave_convert_port( dev, slave, vhcr->in_modifier); - struct mlx4_vport_oper_state *vp_oper = - &priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier]; + struct mlx4_vport_oper_state *vp_oper; if (converted_port < 0) return -EINVAL; @@ -361,6 +428,11 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + err = mlx4_handle_vst_qinq(priv, slave, port); + if (err) + return err; + field = 0; if (dev->caps.phv_bit[port]) field |= QUERY_FUNC_CAP_PHV_BIT; @@ -371,6 +443,9 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, } else if (vhcr->op_modifier == 0) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + /* enable rdma and ethernet interfaces, new quota locations, * and reserved lkey */ @@ -444,6 +519,10 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, size = dev->caps.reserved_lkey + ((slave << 8) & 0xFF00); MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + + if (vhcr->in_modifier & QUERY_FUNC_CAP_SUPPORTS_VST_QINQ) + slave_state->vst_qinq_supported = true; + } else err = -EINVAL; @@ -459,10 +538,12 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, u32 size, qkey; int err = 0, quotas = 0; u32 in_modifier; + u32 slave_caps; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ - in_modifier = op_modifier ? gen_or_port : + slave_caps = QUERY_FUNC_CAP_SUPPORTS_VST_QINQ | QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS; + in_modifier = op_modifier ? gen_or_port : slave_caps; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index fdfe1ace07d7..e4878f31e45d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -483,6 +483,7 @@ struct mlx4_slave_state { u8 init_port_mask; bool active; bool old_vlan_api; + bool vst_qinq_supported; u8 function; dma_addr_t vhcr_dma; u16 mtu[MLX4_MAX_PORTS + 1]; diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 116b284bc4ce..1f3568694a57 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -309,7 +309,8 @@ int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, struct ifla_vf_stats *vf_stats); u32 mlx4_comm_get_version(void); int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, + u8 qos, __be16 proto); int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, int max_tx_rate); int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); -- cgit v1.2.3 From c5136b15ea364124299c8a9ba96b300e96061e3a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Sep 2016 11:35:01 -0400 Subject: netfilter: bridge: add and use br_nf_hook_thresh This replaces the last uses of NF_HOOK_THRESH(). Followup patch will remove it and rename nf_hook_thresh. The reason is that inet (non-bridge) netfilter no longer invokes the hooks from hooks, so we do no longer need the thresh value to skip hooks with a lower priority. The bridge netfilter however may need to do this. br_nf_hook_thresh is a wrapper that is supposed to do this, i.e. only call hooks with a priority that exceeds NF_BR_PRI_BRNF. It's used only in the recursion cases of br_netfilter. It invokes nf_hook_slow while holding an rcu read-side critical section to make a future cleanup simpler. Signed-off-by: Florian Westphal Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/br_netfilter.h | 6 ++++ net/bridge/br_netfilter_hooks.c | 60 ++++++++++++++++++++++++++++++------ net/bridge/br_netfilter_ipv6.c | 12 +++----- 3 files changed, 62 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index e8d1448425a7..0b0c35c37125 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -15,6 +15,12 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) void nf_bridge_update_protocol(struct sk_buff *skb); +int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk, + struct sk_buff *skb, struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct net *, struct sock *, + struct sk_buff *)); + static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 77e7f69bf80d..6029af47377d 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -395,11 +396,10 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, - NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, + NULL, + br_nf_pre_routing_finish); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -417,10 +417,8 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, + br_handle_frame_finish); return 0; } @@ -992,6 +990,50 @@ static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; +/* recursively invokes nf_hook_slow (again), skipping already-called + * hooks (< NF_BR_PRI_BRNF). + * + * Called with rcu read lock held. + */ +int br_nf_hook_thresh(unsigned int hook, struct net *net, + struct sock *sk, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct nf_hook_ops *elem; + struct nf_hook_state state; + struct list_head *head; + int ret; + + head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; + + list_for_each_entry_rcu(elem, head, list) { + struct nf_hook_ops *next; + + next = list_entry_rcu(list_next_rcu(&elem->list), + struct nf_hook_ops, list); + if (next->priority <= NF_BR_PRI_BRNF) + continue; + } + + if (&elem->list == head) + return okfn(net, sk, skb); + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, + NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); + + ret = nf_hook_slow(skb, &state); + rcu_read_unlock(); + if (ret == 1) + ret = okfn(net, sk, skb); + + return ret; +} + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 5e59a8457e7b..5989661c659f 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, + skb->dev, NULL, br_handle_frame_finish); return 0; } -- cgit v1.2.3 From fe72926b792e52ab00abfa81a201805bfb2247d6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Sep 2016 11:35:02 -0400 Subject: netfilter: call nf_hook_state_init with rcu_read_lock held This makes things simpler because we can store the head of the list in the nf_state structure without worrying about concurrent add/delete of hook elements from the list. A future commit will make use of this to implement a simpler linked-list. Signed-off-by: Florian Westphal Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 +++++++- include/linux/netfilter_ingress.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..ad444f0b4ed0 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -174,10 +174,16 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, if (!list_empty(hook_list)) { struct nf_hook_state state; + int ret; + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, net, okfn); - return nf_hook_slow(skb, &state); + + ret = nf_hook_slow(skb, &state); + rcu_read_unlock(); + return ret; } return 1; } diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 5fcd375ef175..6965ba09eba7 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -14,6 +14,7 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) return !list_empty(&skb->dev->nf_hooks_ingress); } +/* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_state state; -- cgit v1.2.3 From 57494343cb5d66962bb197878fb1cc576177db31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Implement slow-start Implement RxRPC slow-start, which is similar to RFC 5681 for TCP. A tracepoint is added to log the state of the congestion management algorithm and the decisions it makes. Notes: (1) Since we send fixed-size DATA packets (apart from the final packet in each phase), counters and calculations are in terms of packets rather than bytes. (2) The ACK packet carries the equivalent of TCP SACK. (3) The FLIGHT_SIZE calculation in RFC 5681 doesn't seem particularly suited to SACK of a small number of packets. It seems that, almost inevitably, by the time three 'duplicate' ACKs have been seen, we have narrowed the loss down to one or two missing packets, and the FLIGHT_SIZE calculation ends up as 2. (4) In rxrpc_resend(), if there was no data that apparently needed retransmission, we transmit a PING ACK to ask the peer to tell us what its Rx window state is. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 45 ++++++++++++ net/rxrpc/ar-internal.h | 53 +++++++++++++- net/rxrpc/call_event.c | 36 ++++++++- net/rxrpc/call_object.c | 13 ++++ net/rxrpc/conn_event.c | 1 + net/rxrpc/input.c | 169 +++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/misc.c | 19 +++++ net/rxrpc/output.c | 9 ++- net/rxrpc/sendmsg.c | 7 +- 9 files changed, 339 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 56475497043d..ada12d00118c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -570,6 +570,51 @@ TRACE_EVENT(rxrpc_retransmit, __entry->expiry) ); +TRACE_EVENT(rxrpc_congest, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + + TP_ARGS(call, summary, ack_serial, change), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_congest_change, change ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + __field(rxrpc_seq_t, lowest_nak ) + __field(rxrpc_serial_t, ack_serial ) + __field_struct(struct rxrpc_ack_summary, sum ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->change = change; + __entry->hard_ack = call->tx_hard_ack; + __entry->top = call->tx_top; + __entry->lowest_nak = call->acks_lowest_nak; + __entry->ack_serial = ack_serial; + memcpy(&__entry->sum, summary, sizeof(__entry->sum)); + ), + + TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + __entry->call, + __entry->ack_serial, + rxrpc_ack_names[__entry->sum.ack_reason], + __entry->hard_ack, + rxrpc_congest_modes[__entry->sum.mode], + __entry->sum.cwnd, + __entry->sum.ssthresh, + __entry->sum.nr_acks, __entry->sum.nr_nacks, + __entry->sum.nr_new_acks, __entry->sum.nr_new_nacks, + __entry->sum.nr_rot_new_acks, + __entry->top - __entry->hard_ack, + __entry->sum.cumulative_acks, + __entry->sum.dup_acks, + __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", + rxrpc_congest_changes[__entry->change], + __entry->sum.retrans_timeo ? " rTxTo" : "") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b1e697fc9ffb..ca96e547cb9a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -402,6 +402,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_PINGING, /* Ping in process */ + RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; /* @@ -446,6 +447,17 @@ enum rxrpc_call_completion { NR__RXRPC_CALL_COMPLETIONS }; +/* + * Call Tx congestion management modes. + */ +enum rxrpc_congest_mode { + RXRPC_CALL_SLOW_START, + RXRPC_CALL_CONGEST_AVOIDANCE, + RXRPC_CALL_PACKET_LOSS, + RXRPC_CALL_FAST_RETRANSMIT, + NR__RXRPC_CONGEST_MODES +}; + /* * RxRPC call definition * - matched by { connection, call_id } @@ -518,6 +530,20 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + + /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS + * is fixed, we keep these numbers in terms of segments (ie. DATA + * packets) rather than bytes. + */ +#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN + u8 cong_cwnd; /* Congestion window size */ + u8 cong_extra; /* Extra to send for congestion management */ + u8 cong_ssthresh; /* Slow-start threshold */ + enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ + u8 cong_dup_acks; /* Count of ACKs showing missing packets */ + u8 cong_cumul_acks; /* Cumulative ACK count */ + ktime_t cong_tstamp; /* Last time cwnd was changed */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not * consumed packet follows this. */ @@ -539,12 +565,13 @@ struct rxrpc_call { ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ + ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ }; /* - * Summary of a new ACK and the changes it made. + * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u8 ack_reason; @@ -554,6 +581,14 @@ struct rxrpc_ack_summary { u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool new_low_nack; /* T if new low NACK found */ + bool retrans_timeo; /* T if reTx due to timeout happened */ + u8 flight_size; /* Number of unreceived transmissions */ + /* Place to stash values for tracing */ + enum rxrpc_congest_mode mode:8; + u8 cwnd; + u8 ssthresh; + u8 dup_acks; + u8 cumulative_acks; }; enum rxrpc_skb_trace { @@ -709,6 +744,7 @@ extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, @@ -729,6 +765,21 @@ enum rxrpc_propose_ack_outcome { extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; +enum rxrpc_congest_change { + rxrpc_cong_begin_retransmission, + rxrpc_cong_cleared_nacks, + rxrpc_cong_new_low_nack, + rxrpc_cong_no_change, + rxrpc_cong_progress, + rxrpc_cong_retransmit_again, + rxrpc_cong_rtt_window_end, + rxrpc_cong_saw_nack, + rxrpc_congest__nr_change +}; + +extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; +extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; + extern const char *const rxrpc_pkts[]; extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 05b94d1acf52..0e8478012212 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -146,6 +146,14 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&call->lock); } +/* + * Handle congestion being detected by the retransmit timeout. + */ +static void rxrpc_congestion_timeout(struct rxrpc_call *call) +{ + set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); +} + /* * Perform retransmission of NAK'd and unack'd packets. */ @@ -154,9 +162,9 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; int ix; - u8 annotation, anno_type; + u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -193,10 +201,13 @@ static void rxrpc_resend(struct rxrpc_call *call) oldest = skb->tstamp; continue; } + if (!(annotation & RXRPC_TX_ANNO_RESENT)) + unacked++; } /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + retrans++; trace_rxrpc_retransmit(call, seq, annotation | anno_type, ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } @@ -210,6 +221,25 @@ static void rxrpc_resend(struct rxrpc_call *call) * reached the nsec timeout yet. */ + if (unacked) + rxrpc_congestion_timeout(call); + + /* If there was nothing that needed retransmission then it's likely + * that an ACK got lost somewhere. Send a ping to find out instead of + * retransmitting data. + */ + if (!retrans) { + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + spin_unlock_bh(&call->lock); + ack_ts = ktime_sub(now, call->acks_latest_ts); + if (ktime_to_ns(ack_ts) < call->peer->rtt) + goto out; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto out; + } + /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the * lock is dropped, it may clear some of the retransmission markers for @@ -260,6 +290,7 @@ static void rxrpc_resend(struct rxrpc_call *call) out_unlock: spin_unlock_bh(&call->lock); +out: _leave(""); } @@ -293,6 +324,7 @@ recheck_state: if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); + goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a53f4c2c0025..d4b3293b78fa 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -160,6 +160,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->rx_expect_next = 1; + + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; + call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; return call; nomem_2: @@ -176,6 +184,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; + ktime_t now; _enter(""); @@ -185,6 +194,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; call->tx_phase = true; + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; _leave(" = %p", call); return call; @@ -325,6 +337,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_ACCEPTING; if (sp->hdr.securityIndex > 0) call->state = RXRPC_CALL_SERVER_SECURING; + call->cong_tstamp = skb->tstamp; /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1cf1ec5f29e..37609ce89f52 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -97,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.maxMTU = htonl(mtu); pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + pkt.whdr.flags |= RXRPC_SLOW_START_OK; len += sizeof(pkt.ack) + sizeof(pkt.info); break; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0344f4494eb7..094720dd1eaf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,166 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Do TCP-style congestion management [RFC 5681]. + */ +static void rxrpc_congestion_management(struct rxrpc_call *call, + struct sk_buff *skb, + struct rxrpc_ack_summary *summary) +{ + enum rxrpc_congest_change change = rxrpc_cong_no_change; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int cumulative_acks = call->cong_cumul_acks; + unsigned int cwnd = call->cong_cwnd; + bool resend = false; + + summary->flight_size = + (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + + if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { + summary->retrans_timeo = true; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = 1; + if (cwnd > call->cong_ssthresh && + call->cong_mode == RXRPC_CALL_SLOW_START) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + cumulative_acks = 0; + } + } + + cumulative_acks += summary->nr_new_acks; + cumulative_acks += summary->nr_rot_new_acks; + if (cumulative_acks > 255) + cumulative_acks = 255; + + summary->mode = call->cong_mode; + summary->cwnd = call->cong_cwnd; + summary->ssthresh = call->cong_ssthresh; + summary->cumulative_acks = cumulative_acks; + summary->dup_acks = call->cong_dup_acks; + + switch (call->cong_mode) { + case RXRPC_CALL_SLOW_START: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + if (summary->cumulative_acks > 0) + cwnd += 1; + if (cwnd > call->cong_ssthresh) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + } + goto out; + + case RXRPC_CALL_CONGEST_AVOIDANCE: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + + /* We analyse the number of packets that get ACK'd per RTT + * period and increase the window if we managed to fill it. + */ + if (call->peer->rtt_usage == 0) + goto out; + if (ktime_before(skb->tstamp, + ktime_add_ns(call->cong_tstamp, + call->peer->rtt))) + goto out_no_clear_ca; + change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = skb->tstamp; + if (cumulative_acks >= cwnd) + cwnd++; + goto out; + + case RXRPC_CALL_PACKET_LOSS: + if (summary->nr_nacks == 0) + goto resume_normality; + + if (summary->new_low_nack) { + change = rxrpc_cong_new_low_nack; + call->cong_dup_acks = 1; + if (call->cong_extra > 1) + call->cong_extra = 1; + goto send_extra_data; + } + + call->cong_dup_acks++; + if (call->cong_dup_acks < 3) + goto send_extra_data; + + change = rxrpc_cong_begin_retransmission; + call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = call->cong_ssthresh + 3; + call->cong_extra = 0; + call->cong_dup_acks = 0; + resend = true; + goto out; + + case RXRPC_CALL_FAST_RETRANSMIT: + if (!summary->new_low_nack) { + if (summary->nr_new_acks == 0) + cwnd += 1; + call->cong_dup_acks++; + if (call->cong_dup_acks == 2) { + change = rxrpc_cong_retransmit_again; + call->cong_dup_acks = 0; + resend = true; + } + } else { + change = rxrpc_cong_progress; + cwnd = call->cong_ssthresh; + if (summary->nr_nacks == 0) + goto resume_normality; + } + goto out; + + default: + BUG(); + goto out; + } + +resume_normality: + change = rxrpc_cong_cleared_nacks; + call->cong_dup_acks = 0; + call->cong_extra = 0; + call->cong_tstamp = skb->tstamp; + if (cwnd <= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_SLOW_START; + else + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; +out: + cumulative_acks = 0; +out_no_clear_ca: + if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) + cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_cwnd = cwnd; + call->cong_cumul_acks = cumulative_acks; + trace_rxrpc_congest(call, summary, sp->hdr.serial, change); + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + return; + +packet_loss_detected: + change = rxrpc_cong_saw_nack; + call->cong_mode = RXRPC_CALL_PACKET_LOSS; + call->cong_dup_acks = 0; + goto send_extra_data; + +send_extra_data: + /* Send some previously unsent DATA if we have some to advance the ACK + * state. + */ + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST || + summary->nr_acks != call->tx_top - call->tx_hard_ack) { + call->cong_extra++; + wake_up(&call->waitq); + } + goto out_no_clear_ca; +} + /* * Ping the other end to fill our RTT cache and to retrieve the rwind * and MTU parameters. @@ -524,7 +684,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - bool resend = false; int ix; u8 annotation, anno_type; @@ -556,16 +715,11 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; - resend = true; break; default: return rxrpc_proto_abort("SFT", call, 0); } } - - if (resend && - !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); } /* @@ -663,6 +817,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial, call->acks_latest); return; } + call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; if (before(hard_ack, call->tx_hard_ack) || @@ -692,6 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); + + return rxrpc_congestion_management(call, skb, &summary); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a608769343e6..aedb8978226d 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -200,6 +200,7 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", @@ -214,3 +215,21 @@ const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { [rxrpc_propose_ack_update] = " Update", [rxrpc_propose_ack_subsume] = " Subsume", }; + +const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { + [RXRPC_CALL_SLOW_START] = "SlowStart", + [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", + [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", + [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", +}; + +const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { + [rxrpc_cong_begin_retransmission] = " Retrans", + [rxrpc_cong_cleared_nacks] = " Cleared", + [rxrpc_cong_new_low_nack] = " NewLowN", + [rxrpc_cong_no_change] = "", + [rxrpc_cong_progress] = " Progres", + [rxrpc_cong_retransmit_again] = " ReTxAgn", + [rxrpc_cong_rtt_window_end] = " RttWinE", + [rxrpc_cong_saw_nack] = " SawNack", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3eb01445e814..cf43a715685e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -157,6 +157,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); + pkt->whdr.flags |= RXRPC_SLOW_START_OK; + iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); @@ -276,8 +278,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; - /* If our RTT cache needs working on, request an ACK. */ - if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + /* If our RTT cache needs working on, request an ACK. Also request + * ACKs if a DATA packet appears to have been lost. + */ + if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) whdr.flags |= RXRPC_REQUEST_ACK; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 99939372b5a4..1f8040d82395 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -45,7 +45,9 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (call->tx_top - call->tx_hard_ack < call->tx_winsize) + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) break; if (call->state >= RXRPC_CALL_COMPLETE) { ret = -call->error; @@ -203,7 +205,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); if (call->tx_top - call->tx_hard_ack >= - call->tx_winsize) { + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.3 From 54f17bbc52f71e2d313721046627c383d6c5c7da Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:06 -0400 Subject: netfilter: nf_queue: whitespace cleanup A future patch will modify the hook drop and outfn functions. This will cause the line lengths to take up too much space. This is simply a readability change. Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 903dca050fb6..8fe85b98b5c8 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -22,10 +22,10 @@ struct nf_queue_entry { /* Packet queuing */ struct nf_queue_handler { - int (*outfn)(struct nf_queue_entry *entry, - unsigned int queuenum); - void (*nf_hook_drop)(struct net *net, - struct nf_hook_ops *ops); + int (*outfn)(struct nf_queue_entry *entry, + unsigned int queuenum); + void (*nf_hook_drop)(struct net *net, + struct nf_hook_ops *ops); }; void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); -- cgit v1.2.3 From e3b37f11e6e4e6b6f02cc762f182ce233d2c1c9d Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:07 -0400 Subject: netfilter: replace list_head with single linked list The netfilter hook list never uses the prev pointer, and so can be trimmed to be a simple singly-linked list. In addition to having a more light weight structure for hook traversal, struct net becomes 5568 bytes (down from 6400) and struct net_device becomes 2176 bytes (down from 2240). Signed-off-by: Aaron Conole Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 2 +- include/linux/netfilter.h | 63 +++++++++-------- include/linux/netfilter_ingress.h | 17 +++-- include/net/netfilter/nf_queue.h | 3 +- include/net/netns/netfilter.h | 2 +- net/bridge/br_netfilter_hooks.c | 19 ++--- net/netfilter/core.c | 141 +++++++++++++++++++++++++------------- net/netfilter/nf_internals.h | 10 +-- net/netfilter/nf_queue.c | 18 ++--- net/netfilter/nfnetlink_queue.c | 8 ++- 10 files changed, 167 insertions(+), 116 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bb978470dc..41f49f5ab62a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1783,7 +1783,7 @@ struct net_device { #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS - struct list_head nf_hooks_ingress; + struct nf_hook_entry __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index ad444f0b4ed0..44e20dac98a9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -55,12 +55,34 @@ struct nf_hook_state { struct net_device *out; struct sock *sk; struct net *net; - struct list_head *hook_list; + struct nf_hook_entry __rcu *hook_entries; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; +typedef unsigned int nf_hookfn(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state); +struct nf_hook_ops { + struct list_head list; + + /* User fills in from here down. */ + nf_hookfn *hook; + struct net_device *dev; + void *priv; + u_int8_t pf; + unsigned int hooknum; + /* Hooks are ordered in ascending priority. */ + int priority; +}; + +struct nf_hook_entry { + struct nf_hook_entry __rcu *next; + struct nf_hook_ops ops; + const struct nf_hook_ops *orig_ops; +}; + static inline void nf_hook_state_init(struct nf_hook_state *p, - struct list_head *hook_list, + struct nf_hook_entry *hook_entry, unsigned int hook, int thresh, u_int8_t pf, struct net_device *indev, @@ -76,26 +98,11 @@ static inline void nf_hook_state_init(struct nf_hook_state *p, p->out = outdev; p->sk = sk; p->net = net; - p->hook_list = hook_list; + RCU_INIT_POINTER(p->hook_entries, hook_entry); p->okfn = okfn; } -typedef unsigned int nf_hookfn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state); - -struct nf_hook_ops { - struct list_head list; - /* User fills in from here down. */ - nf_hookfn *hook; - struct net_device *dev; - void *priv; - u_int8_t pf; - unsigned int hooknum; - /* Hooks are ordered in ascending priority. */ - int priority; -}; struct nf_sockopt_ops { struct list_head list; @@ -161,7 +168,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct net *, struct sock *, struct sk_buff *), int thresh) { - struct list_head *hook_list; + struct nf_hook_entry *hook_head; + int ret = 1; #ifdef HAVE_JUMP_LABEL if (__builtin_constant_p(pf) && @@ -170,22 +178,19 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, return 1; #endif - hook_list = &net->nf.hooks[pf][hook]; - - if (!list_empty(hook_list)) { + rcu_read_lock(); + hook_head = rcu_dereference(net->nf.hooks[pf][hook]); + if (hook_head) { struct nf_hook_state state; - int ret; - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - nf_hook_state_init(&state, hook_list, hook, thresh, + nf_hook_state_init(&state, hook_head, hook, thresh, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state); - rcu_read_unlock(); - return ret; } - return 1; + rcu_read_unlock(); + + return ret; } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 6965ba09eba7..33e37fb41d5d 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -11,23 +11,30 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif - return !list_empty(&skb->dev->nf_hooks_ingress); + return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { + struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; - nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, - NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, - skb->dev, NULL, NULL, dev_net(skb->dev), NULL); + /* Must recheck the ingress hook head, in the event it became NULL + * after the check in nf_hook_ingress_active evaluated to true. + */ + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); return nf_hook_slow(skb, &state); } static inline void nf_hook_ingress_init(struct net_device *dev) { - INIT_LIST_HEAD(&dev->nf_hooks_ingress); + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 8fe85b98b5c8..2280cfe86c56 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -11,7 +11,6 @@ struct nf_queue_entry { struct sk_buff *skb; unsigned int id; - struct nf_hook_ops *elem; struct nf_hook_state state; u16 size; /* sizeof(entry) + saved route keys */ @@ -25,7 +24,7 @@ struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); void (*nf_hook_drop)(struct net *net, - struct nf_hook_ops *ops); + const struct nf_hook_entry *hooks); }; void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 36d723579af2..58487b1cc99a 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -16,6 +16,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; + struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6029af47377d..2fe9345c1407 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1002,28 +1002,21 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct nf_hook_ops *elem; + struct nf_hook_entry *elem; struct nf_hook_state state; - struct list_head *head; int ret; - head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; + elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - list_for_each_entry_rcu(elem, head, list) { - struct nf_hook_ops *next; + while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) + elem = rcu_dereference(elem->next); - next = list_entry_rcu(list_next_rcu(&elem->list), - struct nf_hook_ops, list); - if (next->priority <= NF_BR_PRI_BRNF) - continue; - } - - if (&elem->list == head) + if (!elem) return okfn(net, sk, skb); /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, + nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 67b74287535d..72fc514ec676 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -61,33 +62,50 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); +#define nf_entry_dereference(e) \ + rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct list_head *nf_find_hook_list(struct net *net, - const struct nf_hook_ops *reg) +static struct nf_hook_entry *nf_hook_entry_head(struct net *net, + const struct nf_hook_ops *reg) { - struct list_head *hook_list = NULL; + struct nf_hook_entry *hook_head = NULL; if (reg->pf != NFPROTO_NETDEV) - hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_head = nf_entry_dereference(net->nf.hooks[reg->pf] + [reg->hooknum]); else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - hook_list = ®->dev->nf_hooks_ingress; + hook_head = + nf_entry_dereference( + reg->dev->nf_hooks_ingress); #endif } - return hook_list; + return hook_head; } -struct nf_hook_entry { - const struct nf_hook_ops *orig_ops; - struct nf_hook_ops ops; -}; +/* must hold nf_hook_mutex */ +static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg, + struct nf_hook_entry *entry) +{ + switch (reg->pf) { + case NFPROTO_NETDEV: + /* We already checked in nf_register_net_hook() that this is + * used from ingress. + */ + rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry); + break; + default: + rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], + entry); + break; + } +} int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; + struct nf_hook_entry *hooks_entry; struct nf_hook_entry *entry; - struct nf_hook_ops *elem; if (reg->pf == NFPROTO_NETDEV && (reg->hooknum != NF_NETDEV_INGRESS || @@ -100,19 +118,30 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; + entry->next = NULL; + + mutex_lock(&nf_hook_mutex); + hooks_entry = nf_hook_entry_head(net, reg); - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) { - kfree(entry); - return -ENOENT; + if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) { + /* This is the case where we need to insert at the head */ + entry->next = hooks_entry; + hooks_entry = NULL; } - mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - if (reg->priority < elem->priority) - break; + while (hooks_entry && + reg->priority >= hooks_entry->orig_ops->priority && + nf_entry_dereference(hooks_entry->next)) { + hooks_entry = nf_entry_dereference(hooks_entry->next); + } + + if (hooks_entry) { + entry->next = nf_entry_dereference(hooks_entry->next); + rcu_assign_pointer(hooks_entry->next, entry); + } else { + nf_set_hooks_head(net, reg, entry); } - list_add_rcu(&entry->ops.list, elem->list.prev); + mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -127,24 +156,33 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; - struct nf_hook_entry *entry; - struct nf_hook_ops *elem; - - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) - return; + struct nf_hook_entry *hooks_entry; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - entry = container_of(elem, struct nf_hook_entry, ops); - if (entry->orig_ops == reg) { - list_del_rcu(&entry->ops.list); - break; + hooks_entry = nf_hook_entry_head(net, reg); + if (hooks_entry->orig_ops == reg) { + nf_set_hooks_head(net, reg, + nf_entry_dereference(hooks_entry->next)); + goto unlock; + } + while (hooks_entry && nf_entry_dereference(hooks_entry->next)) { + struct nf_hook_entry *next = + nf_entry_dereference(hooks_entry->next); + struct nf_hook_entry *nnext; + + if (next->orig_ops != reg) { + hooks_entry = next; + continue; } + nnext = nf_entry_dereference(next->next); + rcu_assign_pointer(hooks_entry->next, nnext); + hooks_entry = next; + break; } + +unlock: mutex_unlock(&nf_hook_mutex); - if (&elem->list == hook_list) { + if (!hooks_entry) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } @@ -156,10 +194,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, &entry->ops); + nf_queue_nf_hook_drop(net, hooks_entry); /* other cpu might still process nfqueue verdict that used reg */ synchronize_net(); - kfree(entry); + kfree(hooks_entry); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -258,10 +296,9 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); -unsigned int nf_iterate(struct list_head *head, - struct sk_buff *skb, +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_ops **elemp) + struct nf_hook_entry **entryp) { unsigned int verdict; @@ -269,20 +306,23 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_entry_continue_rcu((*elemp), head, list) { - if (state->thresh > (*elemp)->priority) + while (*entryp) { + if (state->thresh > (*entryp)->ops.priority) { + *entryp = rcu_dereference((*entryp)->next); continue; + } /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = (*elemp)->hook((*elemp)->priv, skb, state); + verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - (*elemp)->hook, state->hook); + (*entryp)->ops.hook, state->hook); + *entryp = rcu_dereference((*entryp)->next); continue; } #endif @@ -290,6 +330,7 @@ repeat: return verdict; goto repeat; } + *entryp = rcu_dereference((*entryp)->next); } return NF_ACCEPT; } @@ -299,13 +340,13 @@ repeat: * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { - struct nf_hook_ops *elem; + struct nf_hook_entry *entry; unsigned int verdict; int ret = 0; - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); + entry = rcu_dereference(state->hook_entries); next_hook: - verdict = nf_iterate(state->hook_list, skb, state, &elem); + verdict = nf_iterate(skb, state, &entry); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -314,8 +355,10 @@ next_hook: if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - int err = nf_queue(skb, elem, state, - verdict >> NF_VERDICT_QBITS); + int err; + + RCU_INIT_POINTER(state->hook_entries, entry); + err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -442,7 +485,7 @@ static int __net_init netfilter_net_init(struct net *net) for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&net->nf.hooks[i][h]); + RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); } #ifdef CONFIG_PROC_FS diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 065522564ac6..e0adb5959342 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,13 +13,13 @@ /* core.c */ -unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - struct nf_hook_state *state, struct nf_hook_ops **elemp); +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, + struct nf_hook_entry **entryp); /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, - struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b19ad20a705c..96964a0070e1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - qh->nf_hook_drop(net, ops); + qh->nf_hook_drop(net, entry); rcu_read_unlock(); } @@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum) { @@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, - .elem = elem, .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -165,11 +163,15 @@ err: void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { + struct nf_hook_entry *hook_entry; struct sk_buff *skb = entry->skb; - struct nf_hook_ops *elem = entry->elem; const struct nf_afinfo *afinfo; + struct nf_hook_ops *elem; int err; + hook_entry = rcu_dereference(entry->state.hook_entries); + elem = &hook_entry->ops; + nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... */ @@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(entry->state.hook_list, - skb, &entry->state, &elem); + verdict = nf_iterate(skb, &entry->state, &hook_entry); } switch (verdict & NF_VERDICT_MASK) { @@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, elem, &entry->state, + RCU_INIT_POINTER(entry->state.hook_entries, hook_entry); + err = nf_queue(skb, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7caa8b082c41..af832c526048 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; -static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr) { - return entry->elem == (struct nf_hook_ops *)ops_ptr; + return rcu_access_pointer(entry->state.hook_entries) == + (struct nf_hook_entry *)entry_ptr; } -static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +static void nfqnl_nf_hook_drop(struct net *net, + const struct nf_hook_entry *hook) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; -- cgit v1.2.3 From 11d5f15723c9f39d7c131d0149d024c17dbef676 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Thu, 22 Sep 2016 12:43:44 -0400 Subject: netfilter: xt_hashlimit: Create revision 2 to support higher pps rates Create a new revision for the hashlimit iptables extension module. Rev 2 will support higher pps of upto 1 million, Version 1 supports only 10k. To support this we have to increase the size of the variables avg and burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2 and xt_hashlimit_mtinfo2 and also create newer versions of all the functions for match, checkentry and destroy. Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very similar in both rev1 and rev2 with only minor changes, so I have split those functions and moved all the common code to a *_common function. Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_hashlimit.h | 23 ++ net/netfilter/xt_hashlimit.c | 330 ++++++++++++++++++++++------ 2 files changed, 285 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 6db90372f09c..3efc0ca18345 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -6,6 +6,7 @@ /* timings are in milliseconds. */ #define XT_HASHLIMIT_SCALE 10000 +#define XT_HASHLIMIT_SCALE_v2 1000000llu /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 * seconds, or one packet every 59 hours. */ @@ -63,6 +64,20 @@ struct hashlimit_cfg1 { __u8 srcmask, dstmask; }; +struct hashlimit_cfg2 { + __u64 avg; /* Average secs between packets * scale */ + __u64 burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + + /* user specified */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? */ + + __u8 srcmask, dstmask; +}; + struct xt_hashlimit_mtinfo1 { char name[IFNAMSIZ]; struct hashlimit_cfg1 cfg; @@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +struct xt_hashlimit_mtinfo2 { + char name[NAME_MAX]; + struct hashlimit_cfg2 cfg; + + /* Used internally by the kernel */ + struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_HASHLIMIT_H */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index e93d9e0a3f35..44a095ecc7b7 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) /* need to declare this at the top */ static const struct file_operations dl_file_ops_v1; +static const struct file_operations dl_file_ops; /* hash table crap */ struct dsthash_dst { @@ -86,8 +87,8 @@ struct dsthash_ent { unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ - u_int32_t credit; - u_int32_t credit_cap, cost; + u_int64_t credit; + u_int64_t credit_cap, cost; } rateinfo; struct rcu_head rcu; }; @@ -98,7 +99,7 @@ struct xt_hashlimit_htable { u_int8_t family; bool rnd_initialized; - struct hashlimit_cfg1 cfg; /* config */ + struct hashlimit_cfg2 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ @@ -114,6 +115,30 @@ struct xt_hashlimit_htable { struct hlist_head hash[0]; /* hashtable itself */ }; +static int +cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) +{ + if (revision == 1) { + struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; + + to->mode = cfg->mode; + to->avg = cfg->avg; + to->burst = cfg->burst; + to->size = cfg->size; + to->max = cfg->max; + to->gc_interval = cfg->gc_interval; + to->expire = cfg->expire; + to->srcmask = cfg->srcmask; + to->dstmask = cfg->dstmask; + } else if (revision == 2) { + memcpy(to, from, sizeof(struct hashlimit_cfg2)); + } else { + return -EINVAL; + } + + return 0; +} + static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ static struct kmem_cache *hashlimit_cachep __read_mostly; @@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, - u_int8_t family) +static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, + const char *name, u_int8_t family, + struct xt_hashlimit_htable **out_hinfo, + int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - unsigned int size; - unsigned int i; + unsigned int size, i; + int ret; - if (minfo->cfg.size) { - size = minfo->cfg.size; + if (cfg->size) { + size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); @@ -238,10 +265,14 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, sizeof(struct list_head) * size); if (hinfo == NULL) return -ENOMEM; - minfo->hinfo = hinfo; + *out_hinfo = hinfo; /* copy match config into hashtable config */ - memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2); + + if (ret) + return ret; + hinfo->cfg.size = size; if (hinfo->cfg.max == 0) hinfo->cfg.max = 8 * hinfo->cfg.size; @@ -255,17 +286,18 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; - hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { vfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); - hinfo->pde = proc_create_data(minfo->name, 0, + hinfo->pde = proc_create_data(name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - &dl_file_ops_v1, hinfo); + (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops, + hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -399,6 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. */ #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24)) +#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a @@ -408,8 +441,11 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) +#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1) +#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ) #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1) /* in byte mode, the lowest possible rate is one packet/second. @@ -425,15 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len) } /* Precision saver. */ -static u32 user2credits(u32 user) +static u64 user2credits(u64 user, int revision) { - /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) - /* Divide first. */ - return (user / XT_HASHLIMIT_SCALE) *\ - HZ * CREDITS_PER_JIFFY_v1; + if (revision == 1) { + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) + /* Divide first. */ + return (user / XT_HASHLIMIT_SCALE) *\ + HZ * CREDITS_PER_JIFFY_v1; + + return (user * HZ * CREDITS_PER_JIFFY_v1) \ + / XT_HASHLIMIT_SCALE; + } else { + if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + return (user / XT_HASHLIMIT_SCALE_v2) *\ + HZ * CREDITS_PER_JIFFY; - return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE; + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2; + } } static u32 user2credits_byte(u32 user) @@ -443,10 +488,11 @@ static u32 user2credits_byte(u32 user) return (u32) (us >> 32); } -static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, + u32 mode, int revision) { unsigned long delta = now - dh->rateinfo.prev; - u32 cap; + u64 cap, cpj; if (delta == 0) return; @@ -454,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { - u32 tmp = dh->rateinfo.credit; + u64 tmp = dh->rateinfo.credit; dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; cap = CREDITS_PER_JIFFY_BYTES * HZ; if (tmp >= dh->rateinfo.credit) {/* overflow */ @@ -462,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) return; } } else { - dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1; + cpj = (revision == 1) ? + CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; + dh->rateinfo.credit += delta * cpj; cap = dh->rateinfo.credit_cap; } if (dh->rateinfo.credit > cap) @@ -470,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) } static void rateinfo_init(struct dsthash_ent *dh, - struct xt_hashlimit_htable *hinfo) + struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { @@ -479,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.credit_cap = hinfo->cfg.burst; } else { dh->rateinfo.credit = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + hinfo->cfg.burst, revision); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision); dh->rateinfo.credit_cap = dh->rateinfo.credit; } } @@ -604,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) } static bool -hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, + struct xt_hashlimit_htable *hinfo, + const struct hashlimit_cfg2 *cfg, int revision) { - const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - struct xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; bool race = false; - u32 cost; + u64 cost; if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; @@ -627,18 +675,18 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) } else if (race) { /* Already got an entry, update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } else { dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_init(dh, hinfo); + rateinfo_init(dh, hinfo, revision); } } else { /* update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } - if (info->cfg.mode & XT_HASHLIMIT_BYTES) + if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else cost = dh->rateinfo.cost; @@ -648,70 +696,126 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); rcu_read_unlock_bh(); - return !(info->cfg.mode & XT_HASHLIMIT_INVERT); + return !(cfg->mode & XT_HASHLIMIT_INVERT); } spin_unlock(&dh->lock); rcu_read_unlock_bh(); /* default match is underlimit - so over the limit, we need to invert */ - return info->cfg.mode & XT_HASHLIMIT_INVERT; + return cfg->mode & XT_HASHLIMIT_INVERT; hotdrop: par->hotdrop = true; return false; } -static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) +static bool +hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_common(skb, par, hinfo, &cfg, 1); +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2); +} + +static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, + struct xt_hashlimit_htable **hinfo, + struct hashlimit_cfg2 *cfg, + const char *name, int revision) { struct net *net = par->net; - struct xt_hashlimit_mtinfo1 *info = par->matchinfo; int ret; - if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) - return -EINVAL; - if (info->name[sizeof(info->name)-1] != '\0') + if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; if (par->family == NFPROTO_IPV4) { - if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; } else { - if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + if (cfg->srcmask > 128 || cfg->dstmask > 128) return -EINVAL; } - if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { + if (cfg->mode & ~XT_HASHLIMIT_ALL) { pr_info("Unknown mode mask %X, kernel too old?\n", - info->cfg.mode); + cfg->mode); return -EINVAL; } /* Check for overflow. */ - if (info->cfg.mode & XT_HASHLIMIT_BYTES) { - if (user2credits_byte(info->cfg.avg) == 0) { - pr_info("overflow, rate too high: %u\n", info->cfg.avg); + if (cfg->mode & XT_HASHLIMIT_BYTES) { + if (user2credits_byte(cfg->avg) == 0) { + pr_info("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; } - } else if (info->cfg.burst == 0 || - user2credits(info->cfg.avg * info->cfg.burst) < - user2credits(info->cfg.avg)) { - pr_info("overflow, try lower: %u/%u\n", - info->cfg.avg, info->cfg.burst); + } else if (cfg->burst == 0 || + user2credits(cfg->avg * cfg->burst, revision) < + user2credits(cfg->avg, revision)) { + pr_info("overflow, try lower: %llu/%llu\n", + cfg->avg, cfg->burst); return -ERANGE; } mutex_lock(&hashlimit_mutex); - info->hinfo = htable_find_get(net, info->name, par->family); - if (info->hinfo == NULL) { - ret = htable_create_v1(net, info, par->family); + *hinfo = htable_find_get(net, name, par->family); + if (*hinfo == NULL) { + ret = htable_create(net, cfg, name, par->family, + hinfo, revision); if (ret < 0) { mutex_unlock(&hashlimit_mutex); return ret; } } mutex_unlock(&hashlimit_mutex); + return 0; } +static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_check_common(par, &info->hinfo, + &cfg, info->name, 1); +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, + info->name, 2); +} + static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; @@ -719,6 +823,13 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) htable_put(info->hinfo); } +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + htable_put(info->hinfo); +} + static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", @@ -730,6 +841,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, + { + .name = "hashlimit", + .revision = 2, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "hashlimit", @@ -741,6 +862,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, + { + .name = "hashlimit", + .revision = 2, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, #endif }; @@ -787,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, - struct seq_file *s) +static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) { - const struct xt_hashlimit_htable *ht = s->private; - - spin_lock(&ent->lock); - /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies, ht->cfg.mode); - switch (family) { case NFPROTO_IPV4: - seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", + seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip.src, ntohs(ent->dst.src_port), @@ -809,7 +934,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: - seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", + seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip6.src, ntohs(ent->dst.src_port), @@ -822,6 +947,34 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, default: BUG(); } +} + +static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1); + + dl_seq_print(ent, family, s); + + spin_unlock(&ent->lock); + return seq_has_overflowed(s); +} + +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + + dl_seq_print(ent, family, s); + spin_unlock(&ent->lock); return seq_has_overflowed(s); } @@ -840,6 +993,20 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) return 0; } +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, &htable->hash[*bucket], node) + if (dl_seq_real_show(ent, htable->family, s)) + return -1; + } + return 0; +} + static const struct seq_operations dl_seq_ops_v1 = { .start = dl_seq_start, .next = dl_seq_next, @@ -847,6 +1014,13 @@ static const struct seq_operations dl_seq_ops_v1 = { .show = dl_seq_show_v1 }; +static const struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + static int dl_proc_open_v1(struct inode *inode, struct file *file) { int ret = seq_open(file, &dl_seq_ops_v1); @@ -858,6 +1032,18 @@ static int dl_proc_open_v1(struct inode *inode, struct file *file) return ret; } +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + + sf->private = PDE_DATA(inode); + } + return ret; +} + static const struct file_operations dl_file_ops_v1 = { .owner = THIS_MODULE, .open = dl_proc_open_v1, @@ -866,6 +1052,14 @@ static const struct file_operations dl_file_ops_v1 = { .release = seq_release }; +static const struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static int __net_init hashlimit_proc_net_init(struct net *net) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); -- cgit v1.2.3 From 0f3cd9b3697708c86a825ae3cedabf7be6fd3e72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Sep 2016 15:23:33 +0200 Subject: netfilter: nf_tables: add range expression Inverse ranges != [a,b] are not currently possible because rules are composites of && operations, and we need to express this: data < a || data > b This patch adds a new range expression. Positive ranges can be already through two cmp expressions: cmp(sreg, data, >=) cmp(sreg, data, <=) This new range expression provides an alternative way to express this. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 3 + include/uapi/linux/netfilter/nf_tables.h | 29 +++++++ net/netfilter/Makefile | 3 +- net/netfilter/nf_tables_core.c | 7 +- net/netfilter/nft_range.c | 138 +++++++++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 net/netfilter/nft_range.c (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a9060dd99db7..00f4f6b1b1ba 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -28,6 +28,9 @@ extern const struct nft_expr_ops nft_cmp_fast_ops; int nft_cmp_module_init(void); void nft_cmp_module_exit(void); +int nft_range_module_init(void); +void nft_range_module_exit(void); + int nft_lookup_module_init(void); void nft_lookup_module_exit(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 1cf41dd838b2..c6c4477c136b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -546,6 +546,35 @@ enum nft_cmp_attributes { }; #define NFTA_CMP_MAX (__NFTA_CMP_MAX - 1) +/** + * enum nft_range_ops - nf_tables range operator + * + * @NFT_RANGE_EQ: equal + * @NFT_RANGE_NEQ: not equal + */ +enum nft_range_ops { + NFT_RANGE_EQ, + NFT_RANGE_NEQ, +}; + +/** + * enum nft_range_attributes - nf_tables range expression netlink attributes + * + * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers) + * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops) + * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes) + * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes) + */ +enum nft_range_attributes { + NFTA_RANGE_UNSPEC, + NFTA_RANGE_SREG, + NFTA_RANGE_OP, + NFTA_RANGE_FROM_DATA, + NFTA_RANGE_TO_DATA, + __NFTA_RANGE_MAX +}; +#define NFTA_RANGE_MAX (__NFTA_RANGE_MAX - 1) + enum nft_lookup_flags { NFT_LOOKUP_F_INV = (1 << 0), }; diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0c8581100ac6..c23c3c84416f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o +nf_tables-objs += nft_lookup.o nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 67259cefef06..7c94ce0080d5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -263,8 +263,13 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err7; - return 0; + err = nft_range_module_init(); + if (err < 0) + goto err8; + return 0; +err8: + nft_dynset_module_exit(); err7: nft_payload_module_exit(); err6: diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c new file mode 100644 index 000000000000..c6d5358482d1 --- /dev/null +++ b/net/netfilter/nft_range.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_range_expr { + struct nft_data data_from; + struct nft_data data_to; + enum nft_registers sreg:8; + u8 len; + enum nft_range_ops op:8; +}; + +static void nft_range_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + bool mismatch; + int d1, d2; + + d1 = memcmp(®s->data[priv->sreg], &priv->data_from, priv->len); + d2 = memcmp(®s->data[priv->sreg], &priv->data_to, priv->len); + switch (priv->op) { + case NFT_RANGE_EQ: + mismatch = (d1 < 0 || d2 > 0); + break; + case NFT_RANGE_NEQ: + mismatch = (d1 >= 0 && d2 <= 0); + break; + } + + if (mismatch) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { + [NFTA_RANGE_SREG] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, + [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_range_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc_from, desc_to; + int err; + + err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), + &desc_from, tb[NFTA_RANGE_FROM_DATA]); + if (err < 0) + return err; + + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), + &desc_to, tb[NFTA_RANGE_TO_DATA]); + if (err < 0) + goto err1; + + if (desc_from.len != desc_to.len) { + err = -EINVAL; + goto err2; + } + + priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); + err = nft_validate_register_load(priv->sreg, desc_from.len); + if (err < 0) + goto err2; + + priv->op = ntohl(nla_get_be32(tb[NFTA_RANGE_OP])); + priv->len = desc_from.len; + return 0; +err2: + nft_data_uninit(&priv->data_to, desc_to.type); +err1: + nft_data_uninit(&priv->data_from, desc_from.type); + return err; +} + +static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from, + NFT_DATA_VALUE, priv->len) < 0 || + nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_range_type; +static const struct nft_expr_ops nft_range_ops = { + .type = &nft_range_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)), + .eval = nft_range_eval, + .init = nft_range_init, + .dump = nft_range_dump, +}; + +static struct nft_expr_type nft_range_type __read_mostly = { + .name = "range", + .ops = &nft_range_ops, + .policy = nft_range_policy, + .maxattr = NFTA_RANGE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_range_module_init(void) +{ + return nft_register_expr(&nft_range_type); +} + +void nft_range_module_exit(void) +{ + nft_unregister_expr(&nft_range_type); +} -- cgit v1.2.3 From ff107d27761ff4b644c82c209e004ec9c8fbbc22 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Sep 2016 16:35:56 +0800 Subject: netfilter: nft_log: complete NFTA_LOG_FLAGS attr support NFTA_LOG_FLAGS attribute is already supported, but the related NF_LOG_XXX flags are not exposed to the userspace. So we cannot explicitly enable log flags to log uid, tcp sequence, ip options and so on, i.e. such rule "nft add rule filter output log uid" is not supported yet. So move NF_LOG_XXX macro definitions to the uapi/../nf_log.h. In order to keep consistent with other modules, change NF_LOG_MASK to refer to all supported log flags. On the other hand, add a new NF_LOG_DEFAULT_MASK to refer to the original default log flags. Finally, if user specify the unsupported log flags or NFTA_LOG_GROUP and NFTA_LOG_FLAGS are set at the same time, report EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 11 +++-------- include/uapi/linux/netfilter/nf_log.h | 12 ++++++++++++ net/bridge/netfilter/ebt_log.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/nf_log_arp.c | 2 +- net/ipv4/netfilter/nf_log_ipv4.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/netfilter/nf_log_ipv6.c | 4 ++-- net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nft_log.c | 9 ++++++++- 10 files changed, 32 insertions(+), 18 deletions(-) create mode 100644 include/uapi/linux/netfilter/nf_log.h (limited to 'include') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index ee07dc8b0a7b..309cd267be4f 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -2,15 +2,10 @@ #define _NF_LOG_H #include +#include -/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will - * disappear once iptables is replaced with pkttables. Please DO NOT use them - * for any new code! */ -#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ -#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ -#define NF_LOG_IPOPT 0x04 /* Log IP options */ -#define NF_LOG_UID 0x08 /* Log UID owning local socket */ -#define NF_LOG_MASK 0x0f +/* Log tcp sequence, tcp options, ip options and uid owning local socket */ +#define NF_LOG_DEFAULT_MASK 0x0f /* This flag indicates that copy_len field in nf_loginfo is set */ #define NF_LOG_F_COPY_LEN 0x1 diff --git a/include/uapi/linux/netfilter/nf_log.h b/include/uapi/linux/netfilter/nf_log.h new file mode 100644 index 000000000000..8be21e02387d --- /dev/null +++ b/include/uapi/linux/netfilter/nf_log.h @@ -0,0 +1,12 @@ +#ifndef _NETFILTER_NF_LOG_H +#define _NETFILTER_NF_LOG_H + +#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ +#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ +#define NF_LOG_IPOPT 0x04 /* Log IP options */ +#define NF_LOG_UID 0x08 /* Log UID owning local socket */ +#define NF_LOG_NFLOG 0x10 /* Unsupported, don't reuse */ +#define NF_LOG_MACDECODE 0x20 /* Decode MAC header */ +#define NF_LOG_MASK 0x2f + +#endif /* _NETFILTER_NF_LOG_H */ diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 152300d164ac..9a11086ba6ff 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, if (loginfo->type == NF_LOG_TYPE_LOG) bitmask = loginfo->u.log.logflags; else - bitmask = NF_LOG_MASK; + bitmask = NF_LOG_DEFAULT_MASK; if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f993545a3373..7c00ce90adb8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = 4, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 8945c2653814..b24795e2ee6d 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 20f225593a8b..5b571e1b5f15 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 552fac2f390a..55aacea24396 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index c1bcf699a23d..f6aee2895fee 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 7c94ce0080d5..0dd5c695482f 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 24a73bb26e94..1b01404bb33f 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_LEVEL] != NULL && tb[NFTA_LOG_GROUP] != NULL) return -EINVAL; - if (tb[NFTA_LOG_GROUP] != NULL) + if (tb[NFTA_LOG_GROUP] != NULL) { li->type = NF_LOG_TYPE_ULOG; + if (tb[NFTA_LOG_FLAGS] != NULL) + return -EINVAL; + } nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + if (li->u.log.logflags & ~NF_LOG_MASK) { + err = -EINVAL; + goto err1; + } } break; case NF_LOG_TYPE_ULOG: -- cgit v1.2.3 From a7c7fbff6a408d00431c705bbe3dfc5f51e3f1c4 Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Wed, 14 Sep 2016 17:38:44 +0530 Subject: cfg80211: Add support to configure a beacon data rate This allows an option to configure a single beacon tx rate for an AP. Signed-off-by: Purushottam Kushwaha Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 25 +-- net/wireless/nl80211.c | 510 ++++++++++++++++++++++++++++--------------------- 2 files changed, 302 insertions(+), 233 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bd26cc6e2d79..e0949c8bc2d1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -676,6 +676,18 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; +/* + * cfg80211_bitrate_mask - masks for bitrate control + */ +struct cfg80211_bitrate_mask { + struct { + u32 legacy; + u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mcs[NL80211_VHT_NSS_MAX]; + enum nl80211_txrate_gi gi; + } control[NUM_NL80211_BANDS]; +}; + /** * struct cfg80211_ap_settings - AP configuration * @@ -700,6 +712,7 @@ struct cfg80211_acl_data { * MAC address based access control * @pbss: If set, start as a PCP instead of AP. Relevant for DMG * networks. + * @beacon_rate: masks for setting user configured beacon tx rate. */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -719,6 +732,7 @@ struct cfg80211_ap_settings { bool p2p_opp_ps; const struct cfg80211_acl_data *acl; bool pbss; + struct cfg80211_bitrate_mask beacon_rate; }; /** @@ -2010,17 +2024,6 @@ enum wiphy_params_flags { WIPHY_PARAM_DYN_ACK = 1 << 5, }; -/* - * cfg80211_bitrate_mask - masks for bitrate control - */ -struct cfg80211_bitrate_mask { - struct { - u32 legacy; - u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; - u16 vht_mcs[NL80211_VHT_NSS_MAX]; - enum nl80211_txrate_gi gi; - } control[NUM_NL80211_BANDS]; -}; /** * struct cfg80211_pmksa - PMK Security Association * diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 887c4c114206..a10484da60c0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3340,6 +3340,279 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info) return err; } +static u32 rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len) +{ + u8 i; + u32 mask = 0; + + for (i = 0; i < rates_len; i++) { + int rate = (rates[i] & 0x7f) * 5; + int ridx; + + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = + &sband->bitrates[ridx]; + if (rate == srate->bitrate) { + mask |= 1 << ridx; + break; + } + } + if (ridx == sband->n_bitrates) + return 0; /* rate not found */ + } + + return mask; +} + +static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len, + u8 mcs[IEEE80211_HT_MCS_MASK_LEN]) +{ + u8 i; + + memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN); + + for (i = 0; i < rates_len; i++) { + int ridx, rbit; + + ridx = rates[i] / 8; + rbit = BIT(rates[i] % 8); + + /* check validity */ + if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN)) + return false; + + /* check availability */ + if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) + mcs[ridx] |= rbit; + else + return false; + } + + return true; +} + +static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map) +{ + u16 mcs_mask = 0; + + switch (vht_mcs_map) { + case IEEE80211_VHT_MCS_NOT_SUPPORTED: + break; + case IEEE80211_VHT_MCS_SUPPORT_0_7: + mcs_mask = 0x00FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_8: + mcs_mask = 0x01FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_9: + mcs_mask = 0x03FF; + break; + default: + break; + } + + return mcs_mask; +} + +static void vht_build_mcs_mask(u16 vht_mcs_map, + u16 vht_mcs_mask[NL80211_VHT_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) { + vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03); + vht_mcs_map >>= 2; + } +} + +static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, + struct nl80211_txrate_vht *txrate, + u16 mcs[NL80211_VHT_NSS_MAX]) +{ + u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {}; + u8 i; + + if (!sband->vht_cap.vht_supported) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX); + + /* Build vht_mcs_mask from VHT capabilities */ + vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)}, + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + +static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct cfg80211_bitrate_mask *mask) +{ + struct nlattr *tb[NL80211_TXRATE_MAX + 1]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int rem, i; + struct nlattr *tx_rates; + struct ieee80211_supported_band *sband; + u16 vht_tx_mcs_map; + + memset(mask, 0, sizeof(*mask)); + /* Default to all rates enabled */ + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sband = rdev->wiphy.bands[i]; + + if (!sband) + continue; + + mask->control[i].legacy = (1 << sband->n_bitrates) - 1; + memcpy(mask->control[i].ht_mcs, + sband->ht_cap.mcs.rx_mask, + sizeof(mask->control[i].ht_mcs)); + + if (!sband->vht_cap.vht_supported) + continue; + + vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + } + + /* if no rates are given set it back to the defaults */ + if (!info->attrs[NL80211_ATTR_TX_RATES]) + goto out; + + /* The nested attribute uses enum nl80211_band as the index. This maps + * directly to the enum nl80211_band values used in cfg80211. + */ + BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + enum nl80211_band band = nla_type(tx_rates); + int err; + + if (band < 0 || band >= NUM_NL80211_BANDS) + return -EINVAL; + sband = rdev->wiphy.bands[band]; + if (sband == NULL) + return -EINVAL; + err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (err) + return err; + if (tb[NL80211_TXRATE_LEGACY]) { + mask->control[band].legacy = rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_LEGACY]), + nla_len(tb[NL80211_TXRATE_LEGACY])); + if ((mask->control[band].legacy == 0) && + nla_len(tb[NL80211_TXRATE_LEGACY])) + return -EINVAL; + } + if (tb[NL80211_TXRATE_HT]) { + if (!ht_rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_HT]), + nla_len(tb[NL80211_TXRATE_HT]), + mask->control[band].ht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_VHT]) { + if (!vht_set_mcs_mask( + sband, + nla_data(tb[NL80211_TXRATE_VHT]), + mask->control[band].vht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_GI]) { + mask->control[band].gi = + nla_get_u8(tb[NL80211_TXRATE_GI]); + if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI) + return -EINVAL; + } + + if (mask->control[band].legacy == 0) { + /* don't allow empty legacy rates if HT or VHT + * are not even supported. + */ + if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || + rdev->wiphy.bands[band]->vht_cap.vht_supported)) + return -EINVAL; + + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) + if (mask->control[band].ht_mcs[i]) + goto out; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + if (mask->control[band].vht_mcs[i]) + goto out; + + /* legacy and mcs rates may not be both empty */ + return -EINVAL; + } + } + +out: + return 0; +} + +static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params) +{ + u32 rate, count_ht, count_vht, i; + enum nl80211_band band; + + band = params->chandef.chan->band; + rate = params->beacon_rate.control[band].legacy; + + /* Allow only one rate */ + if (hweight32(rate) > 1) + return -EINVAL; + + count_ht = 0; + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { + if (hweight8(params->beacon_rate.control[band].ht_mcs[i]) > 1) { + return -EINVAL; + } else if (params->beacon_rate.control[band].ht_mcs[i]) { + count_ht++; + if (count_ht > 1) + return -EINVAL; + } + if (count_ht && rate) + return -EINVAL; + } + + count_vht = 0; + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + if (hweight16(params->beacon_rate.control[band].vht_mcs[i]) > 1) { + return -EINVAL; + } else if (params->beacon_rate.control[band].vht_mcs[i]) { + count_vht++; + if (count_vht > 1) + return -EINVAL; + } + if (count_vht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht)) + return -EINVAL; + + return 0; +} + static int nl80211_parse_beacon(struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) { @@ -3569,6 +3842,16 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->iftype)) return -EINVAL; + if (info->attrs[NL80211_ATTR_TX_RATES]) { + err = nl80211_parse_tx_bitrate_mask(info, ¶ms.beacon_rate); + if (err) + return err; + + err = validate_beacon_tx_rate(¶ms); + if (err) + return err; + } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { params.smps_mode = nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); @@ -8641,238 +8924,21 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, return rdev_cancel_remain_on_channel(rdev, wdev, cookie); } -static u32 rateset_to_mask(struct ieee80211_supported_band *sband, - u8 *rates, u8 rates_len) -{ - u8 i; - u32 mask = 0; - - for (i = 0; i < rates_len; i++) { - int rate = (rates[i] & 0x7f) * 5; - int ridx; - - for (ridx = 0; ridx < sband->n_bitrates; ridx++) { - struct ieee80211_rate *srate = - &sband->bitrates[ridx]; - if (rate == srate->bitrate) { - mask |= 1 << ridx; - break; - } - } - if (ridx == sband->n_bitrates) - return 0; /* rate not found */ - } - - return mask; -} - -static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, - u8 *rates, u8 rates_len, - u8 mcs[IEEE80211_HT_MCS_MASK_LEN]) -{ - u8 i; - - memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN); - - for (i = 0; i < rates_len; i++) { - int ridx, rbit; - - ridx = rates[i] / 8; - rbit = BIT(rates[i] % 8); - - /* check validity */ - if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN)) - return false; - - /* check availability */ - if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) - mcs[ridx] |= rbit; - else - return false; - } - - return true; -} - -static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map) -{ - u16 mcs_mask = 0; - - switch (vht_mcs_map) { - case IEEE80211_VHT_MCS_NOT_SUPPORTED: - break; - case IEEE80211_VHT_MCS_SUPPORT_0_7: - mcs_mask = 0x00FF; - break; - case IEEE80211_VHT_MCS_SUPPORT_0_8: - mcs_mask = 0x01FF; - break; - case IEEE80211_VHT_MCS_SUPPORT_0_9: - mcs_mask = 0x03FF; - break; - default: - break; - } - - return mcs_mask; -} - -static void vht_build_mcs_mask(u16 vht_mcs_map, - u16 vht_mcs_mask[NL80211_VHT_NSS_MAX]) -{ - u8 nss; - - for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) { - vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03); - vht_mcs_map >>= 2; - } -} - -static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, - struct nl80211_txrate_vht *txrate, - u16 mcs[NL80211_VHT_NSS_MAX]) -{ - u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {}; - u8 i; - - if (!sband->vht_cap.vht_supported) - return false; - - memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX); - - /* Build vht_mcs_mask from VHT capabilities */ - vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask); - - for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { - if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) - mcs[i] = txrate->mcs[i]; - else - return false; - } - - return true; -} - -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)}, - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *tb[NL80211_TXRATE_MAX + 1]; - struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct cfg80211_bitrate_mask mask; - int rem, i; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - struct nlattr *tx_rates; - struct ieee80211_supported_band *sband; - u16 vht_tx_mcs_map; + int err; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - memset(&mask, 0, sizeof(mask)); - /* Default to all rates enabled */ - for (i = 0; i < NUM_NL80211_BANDS; i++) { - sband = rdev->wiphy.bands[i]; - - if (!sband) - continue; - - mask.control[i].legacy = (1 << sband->n_bitrates) - 1; - memcpy(mask.control[i].ht_mcs, - sband->ht_cap.mcs.rx_mask, - sizeof(mask.control[i].ht_mcs)); - - if (!sband->vht_cap.vht_supported) - continue; - - vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs); - } - - /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) - goto out; - - /* - * The nested attribute uses enum nl80211_band as the index. This maps - * directly to the enum nl80211_band values used in cfg80211. - */ - BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { - enum nl80211_band band = nla_type(tx_rates); - int err; - - if (band < 0 || band >= NUM_NL80211_BANDS) - return -EINVAL; - sband = rdev->wiphy.bands[band]; - if (sband == NULL) - return -EINVAL; - err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), - nla_len(tx_rates), nl80211_txattr_policy); - if (err) - return err; - if (tb[NL80211_TXRATE_LEGACY]) { - mask.control[band].legacy = rateset_to_mask( - sband, - nla_data(tb[NL80211_TXRATE_LEGACY]), - nla_len(tb[NL80211_TXRATE_LEGACY])); - if ((mask.control[band].legacy == 0) && - nla_len(tb[NL80211_TXRATE_LEGACY])) - return -EINVAL; - } - if (tb[NL80211_TXRATE_HT]) { - if (!ht_rateset_to_mask( - sband, - nla_data(tb[NL80211_TXRATE_HT]), - nla_len(tb[NL80211_TXRATE_HT]), - mask.control[band].ht_mcs)) - return -EINVAL; - } - if (tb[NL80211_TXRATE_VHT]) { - if (!vht_set_mcs_mask( - sband, - nla_data(tb[NL80211_TXRATE_VHT]), - mask.control[band].vht_mcs)) - return -EINVAL; - } - if (tb[NL80211_TXRATE_GI]) { - mask.control[band].gi = - nla_get_u8(tb[NL80211_TXRATE_GI]); - if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI) - return -EINVAL; - } - - if (mask.control[band].legacy == 0) { - /* don't allow empty legacy rates if HT or VHT - * are not even supported. - */ - if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || - rdev->wiphy.bands[band]->vht_cap.vht_supported)) - return -EINVAL; - - for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) - if (mask.control[band].ht_mcs[i]) - goto out; - - for (i = 0; i < NL80211_VHT_NSS_MAX; i++) - if (mask.control[band].vht_mcs[i]) - goto out; - - /* legacy and mcs rates may not be both empty */ - return -EINVAL; - } - } + err = nl80211_parse_tx_bitrate_mask(info, &mask); + if (err) + return err; -out: return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); } -- cgit v1.2.3 From 8564e38206de2ff005a27c8d7c2ce3869a44f0dd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Sep 2016 09:44:44 +0200 Subject: cfg80211: add checks for beacon rate, extend to mesh The previous commit added support for specifying the beacon rate for AP mode. Add features checks to this, and extend it to also support the rate configuration for mesh networks. For IBSS it's not as simple due to joining etc., so that's not yet supported. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 17 +++++++++++++++- net/wireless/nl80211.c | 46 +++++++++++++++++++++++++++++++++----------- 3 files changed, 54 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e0949c8bc2d1..ed37304fa09d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -712,7 +712,7 @@ struct cfg80211_bitrate_mask { * MAC address based access control * @pbss: If set, start as a PCP instead of AP. Relevant for DMG * networks. - * @beacon_rate: masks for setting user configured beacon tx rate. + * @beacon_rate: bitrate to be used for beacons */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1365,6 +1365,7 @@ struct mesh_config { * @beacon_interval: beacon interval to use * @mcast_rate: multicat rate for Mesh Node [6Mbps is the default for 802.11a] * @basic_rates: basic rates to use when creating the mesh + * @beacon_rate: bitrate to be used for beacons * * These parameters are fixed when the mesh is created. */ @@ -1385,6 +1386,7 @@ struct mesh_setup { u16 beacon_interval; int mcast_rate[NUM_NL80211_BANDS]; u32 basic_rates; + struct cfg80211_bitrate_mask beacon_rate; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 220694151434..ec10d1b2838f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1343,7 +1343,13 @@ enum nl80211_commands { * enum nl80211_band value is used as the index (nla_type() of the nested * data. If a band is not included, it will be configured to allow all * rates based on negotiated supported rates information. This attribute - * is used with %NL80211_CMD_SET_TX_BITRATE_MASK. + * is used with %NL80211_CMD_SET_TX_BITRATE_MASK and with starting AP, + * and joining mesh networks (not IBSS yet). In the later case, it must + * specify just a single bitrate, which is to be used for the beacon. + * The driver must also specify support for this with the extended + * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, + * NL80211_EXT_FEATURE_BEACON_RATE_HT and + * NL80211_EXT_FEATURE_BEACON_RATE_VHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. @@ -4551,6 +4557,12 @@ enum nl80211_feature_flags { * (if available). * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of * channel dwell time. + * @NL80211_EXT_FEATURE_BEACON_RATE_LEGACY: Driver supports beacon rate + * configuration (AP/mesh), supporting a legacy (non HT/VHT) rate. + * @NL80211_EXT_FEATURE_BEACON_RATE_HT: Driver supports beacon rate + * configuration (AP/mesh) with HT rates. + * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate + * configuration (AP/mesh) with VHT rates. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4562,6 +4574,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_START_TIME, NL80211_EXT_FEATURE_BSS_PARENT_TSF, NL80211_EXT_FEATURE_SET_SCAN_DWELL, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, + NL80211_EXT_FEATURE_BEACON_RATE_HT, + NL80211_EXT_FEATURE_BEACON_RATE_VHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a10484da60c0..b8441e60b0f6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3569,13 +3569,12 @@ out: return 0; } -static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params) +static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, + enum nl80211_band band, + struct cfg80211_bitrate_mask *beacon_rate) { - u32 rate, count_ht, count_vht, i; - enum nl80211_band band; - - band = params->chandef.chan->band; - rate = params->beacon_rate.control[band].legacy; + u32 count_ht, count_vht, i; + u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ if (hweight32(rate) > 1) @@ -3583,9 +3582,9 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params) count_ht = 0; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { - if (hweight8(params->beacon_rate.control[band].ht_mcs[i]) > 1) { + if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) { return -EINVAL; - } else if (params->beacon_rate.control[band].ht_mcs[i]) { + } else if (beacon_rate->control[band].ht_mcs[i]) { count_ht++; if (count_ht > 1) return -EINVAL; @@ -3596,9 +3595,9 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params) count_vht = 0; for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { - if (hweight16(params->beacon_rate.control[band].vht_mcs[i]) > 1) { + if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) { return -EINVAL; - } else if (params->beacon_rate.control[band].vht_mcs[i]) { + } else if (beacon_rate->control[band].vht_mcs[i]) { count_vht++; if (count_vht > 1) return -EINVAL; @@ -3610,6 +3609,19 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params) if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht)) return -EINVAL; + if (rate && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) + return -EINVAL; + if (count_ht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_HT)) + return -EINVAL; + if (count_vht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_VHT)) + return -EINVAL; + return 0; } @@ -3847,7 +3859,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = validate_beacon_tx_rate(¶ms); + err = validate_beacon_tx_rate(rdev, params.chandef.chan->band, + ¶ms.beacon_rate); if (err) return err; } @@ -9406,6 +9419,17 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) return err; } + if (info->attrs[NL80211_ATTR_TX_RATES]) { + err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + if (err) + return err; + + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, + &setup.beacon_rate); + if (err) + return err; + } + return cfg80211_join_mesh(rdev, dev, &setup, &cfg); } -- cgit v1.2.3 From b90eb754949931b2e4481b1df9a03f84d4be66ba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:29 +0200 Subject: fib: introduce FIB notification infrastructure This allows to pass information about added/deleted FIB entries/rules to whoever is interested. This is done in a very similar way as devinet notifies address additions/removals. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 34 ++++++++++++++++++++++++--- net/ipv4/fib_frontend.c | 16 ++++++------- net/ipv4/fib_rules.c | 10 ++++++++ net/ipv4/fib_trie.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 108 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 7d4a72e75f33..116a9c0eb455 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -22,6 +22,7 @@ #include #include #include +#include struct fib_config { u8 fc_dst_len; @@ -185,6 +186,33 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); #define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ FIB_RES_SADDR(net, res)) +struct fib_notifier_info { + struct net *net; +}; + +struct fib_entry_notifier_info { + struct fib_notifier_info info; /* must be first */ + u32 dst; + int dst_len; + struct fib_info *fi; + u8 tos; + u8 type; + u32 tb_id; + u32 nlflags; +}; + +enum fib_event_type { + FIB_EVENT_ENTRY_ADD, + FIB_EVENT_ENTRY_DEL, + FIB_EVENT_RULE_ADD, + FIB_EVENT_RULE_DEL, +}; + +int register_fib_notifier(struct notifier_block *nb); +int unregister_fib_notifier(struct notifier_block *nb); +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info); + struct fib_table { struct hlist_node tb_hlist; u32 tb_id; @@ -196,11 +224,11 @@ struct fib_table { int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); -int fib_table_insert(struct fib_table *, struct fib_config *); -int fib_table_delete(struct fib_table *, struct fib_config *); +int fib_table_insert(struct net *, struct fib_table *, struct fib_config *); +int fib_table_delete(struct net *, struct fib_table *, struct fib_config *); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); -int fib_table_flush(struct fib_table *table); +int fib_table_flush(struct net *net, struct fib_table *table); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e56a4c20a3c..86c43dc9a60e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -182,7 +182,7 @@ static void fib_flush(struct net *net) struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) - flushed += fib_table_flush(tb); + flushed += fib_table_flush(net, tb); } if (flushed) @@ -590,13 +590,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = fib_table_delete(tb, &cfg); + err = fib_table_delete(net, tb, &cfg); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = fib_table_insert(tb, &cfg); + err = fib_table_insert(net, tb, &cfg); else err = -ENOBUFS; } @@ -719,7 +719,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = fib_table_delete(tb, &cfg); + err = fib_table_delete(net, tb, &cfg); errout: return err; } @@ -741,7 +741,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = fib_table_insert(tb, &cfg); + err = fib_table_insert(net, tb, &cfg); errout: return err; } @@ -828,9 +828,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - fib_table_insert(tb, &cfg); + fib_table_insert(net, tb, &cfg); else - fib_table_delete(tb, &cfg); + fib_table_delete(net, tb, &cfg); } void fib_add_ifaddr(struct in_ifaddr *ifa) @@ -1254,7 +1254,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); - fib_table_flush(tb); + fib_table_flush(net, tb); fib_free_table(tb); } } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 770bebed6b28..ebadf6b99499 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -164,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type) +{ + struct fib_notifier_info info; + + return call_fib_notifiers(net, event_type, &info); +} + static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, @@ -221,6 +229,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, net->ipv4.fib_has_custom_rules = true; fib_flush_external(rule->fr_net); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); err = 0; errout: @@ -243,6 +252,7 @@ static int fib4_rule_delete(struct fib_rule *rule) #endif net->ipv4.fib_has_custom_rules = true; fib_flush_external(rule->fr_net); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); errout: return err; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 241f27bbd7ad..51a4537eb145 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,44 @@ #include #include "fib_lookup.h" +static BLOCKING_NOTIFIER_HEAD(fib_chain); + +int register_fib_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&fib_chain, nb); +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return blocking_notifier_call_chain(&fib_chain, event_type, info); +} + +static int call_fib_entry_notifiers(struct net *net, + enum fib_event_type event_type, u32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id, u32 nlflags) +{ + struct fib_entry_notifier_info info = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .tb_id = tb_id, + .nlflags = nlflags, + }; + return call_fib_notifiers(net, event_type, &info.info); +} + #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) @@ -1076,7 +1115,8 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, } /* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct net *net, struct fib_table *tb, + struct fib_config *cfg) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; @@ -1193,6 +1233,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); + + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, + key, plen, fi, + new_fa->fa_tos, cfg->fc_type, + tb->tb_id, cfg->fc_nlflags); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1245,6 +1290,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos, + cfg->fc_type, tb->tb_id, cfg->fc_nlflags); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1490,7 +1537,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, } /* Caller must hold RTNL. */ -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct net *net, struct fib_table *tb, + struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; @@ -1546,6 +1594,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, cfg->fc_type, tb->tb_id); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, + fa_to_delete->fa_info, tos, cfg->fc_type, + tb->tb_id, 0); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1809,7 +1860,7 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; @@ -1861,6 +1912,11 @@ int fib_table_flush(struct fib_table *tb) switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, fa->fa_type, tb->tb_id); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, + n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); -- cgit v1.2.3 From c98501879b1b1af90c7325574f2672e9efca592c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:30 +0200 Subject: fib: introduce FIB info offload flag helpers These helpers are to be used in case someone offloads the FIB entry. The result is that if the entry is offloaded to at least one device, the offload flag is set. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 13 +++++++++++++ net/switchdev/switchdev.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 116a9c0eb455..ffccf1787914 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -123,6 +123,7 @@ struct fib_info { #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; #endif + unsigned int fib_offload_cnt; struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev @@ -174,6 +175,18 @@ struct fib_result_nl { __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); +static inline void fib_info_offload_inc(struct fib_info *fi) +{ + fi->fib_offload_cnt++; + fi->fib_flags |= RTNH_F_OFFLOAD; +} + +static inline void fib_info_offload_dec(struct fib_info *fi) +{ + if (--fi->fib_offload_cnt == 0) + fi->fib_flags &= ~RTNH_F_OFFLOAD; +} + #define FIB_RES_SADDR(net, res) \ ((FIB_RES_NH(res).nh_saddr_genid == \ atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 10b819308439..abd8d2a38a7d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1216,7 +1216,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_add(dev, &ipv4_fib.obj); if (!err) - fi->fib_flags |= RTNH_F_OFFLOAD; + fib_info_offload_inc(fi); return err == -EOPNOTSUPP ? 0 : err; } @@ -1260,7 +1260,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_del(dev, &ipv4_fib.obj); if (!err) - fi->fib_flags &= ~RTNH_F_OFFLOAD; + fib_info_offload_dec(fi); return err == -EOPNOTSUPP ? 0 : err; } -- cgit v1.2.3 From 347e3b28c1ba24c1ae2f30290d8247480ab9ce14 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:33 +0200 Subject: switchdev: remove FIB offload infrastructure Since this is now taken care of by FIB notifier, remove the code, with all unused dependencies. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 - include/net/switchdev.h | 40 ---------- net/ipv4/fib_frontend.c | 13 ---- net/ipv4/fib_rules.c | 2 - net/ipv4/fib_trie.c | 104 +------------------------- net/switchdev/switchdev.c | 181 ---------------------------------------------- 6 files changed, 1 insertion(+), 341 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ffccf1787914..b9314b48e39f 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -243,7 +243,6 @@ int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); int fib_table_flush(struct net *net, struct fib_table *table); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); -void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -356,7 +355,6 @@ static inline int fib_num_tclassid_users(struct net *net) } #endif int fib_unmerge(struct net *net); -void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 729fe1534160..eba80c4fc56f 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -68,7 +68,6 @@ struct switchdev_attr { enum switchdev_obj_id { SWITCHDEV_OBJ_ID_UNDEFINED, SWITCHDEV_OBJ_ID_PORT_VLAN, - SWITCHDEV_OBJ_ID_IPV4_FIB, SWITCHDEV_OBJ_ID_PORT_FDB, SWITCHDEV_OBJ_ID_PORT_MDB, }; @@ -92,21 +91,6 @@ struct switchdev_obj_port_vlan { #define SWITCHDEV_OBJ_PORT_VLAN(obj) \ container_of(obj, struct switchdev_obj_port_vlan, obj) -/* SWITCHDEV_OBJ_ID_IPV4_FIB */ -struct switchdev_obj_ipv4_fib { - struct switchdev_obj obj; - u32 dst; - int dst_len; - struct fib_info *fi; - u8 tos; - u8 type; - u32 nlflags; - u32 tb_id; -}; - -#define SWITCHDEV_OBJ_IPV4_FIB(obj) \ - container_of(obj, struct switchdev_obj_ipv4_fib, obj) - /* SWITCHDEV_OBJ_ID_PORT_FDB */ struct switchdev_obj_port_fdb { struct switchdev_obj obj; @@ -209,11 +193,6 @@ int switchdev_port_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int switchdev_port_bridge_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); -int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id); -int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id); -void switchdev_fib_ipv4_abort(struct fib_info *fi); int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlm_flags); @@ -304,25 +283,6 @@ static inline int switchdev_port_bridge_dellink(struct net_device *dev, return -EOPNOTSUPP; } -static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, u32 tb_id) -{ - return 0; -} - -static inline int switchdev_fib_ipv4_del(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) -{ - return 0; -} - -static inline void switchdev_fib_ipv4_abort(struct fib_info *fi) -{ -} - static inline int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86c43dc9a60e..c3b80478226e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -189,19 +189,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net); } -void fib_flush_external(struct net *net) -{ - struct fib_table *tb; - struct hlist_head *head; - unsigned int h; - - for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) - fib_table_flush_external(tb); - } -} - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index ebadf6b99499..2e50062f642d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -228,7 +228,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - fib_flush_external(rule->fr_net); call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); err = 0; @@ -251,7 +250,6 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - fib_flush_external(rule->fr_net); call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); errout: return err; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 51a4537eb145..31cef3602585 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include "fib_lookup.h" @@ -1215,17 +1214,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - err = switchdev_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); - if (err) { - switchdev_fib_ipv4_abort(fi); - kmem_cache_free(fn_alias_kmem, new_fa); - goto out; - } - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); @@ -1273,18 +1261,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - /* (Optionally) offload fib entry to switch hardware. */ - err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, - cfg->fc_nlflags, tb->tb_id); - if (err) { - switchdev_fib_ipv4_abort(fi); - goto out_free_new_fa; - } - /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_sw_fib_del; + goto out_free_new_fa; if (!plen) tb->tb_num_default++; @@ -1297,8 +1277,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_sw_fib_del: - switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1591,9 +1569,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, fa_to_delete->fa_info, tos, cfg->fc_type, tb->tb_id, 0); @@ -1785,80 +1760,6 @@ out: return NULL; } -/* Caller must hold RTNL */ -void fib_table_flush_external(struct fib_table *tb) -{ - struct trie *t = (struct trie *)tb->tb_data; - struct key_vector *pn = t->kv; - unsigned long cindex = 1; - struct hlist_node *tmp; - struct fib_alias *fa; - - /* walk trie in reverse order */ - for (;;) { - unsigned char slen = 0; - struct key_vector *n; - - if (!(cindex--)) { - t_key pkey = pn->key; - - /* cannot resize the trie vector */ - if (IS_TRIE(pn)) - break; - - /* resize completed node */ - pn = resize(t, pn); - cindex = get_index(pkey, pn); - - continue; - } - - /* grab the next available node */ - n = get_child(pn, cindex); - if (!n) - continue; - - if (IS_TNODE(n)) { - /* record pn and cindex for leaf walking */ - pn = n; - cindex = 1ul << n->bits; - - continue; - } - - hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { - struct fib_info *fi = fa->fa_info; - - /* if alias was cloned to local then we just - * need to remove the local copy from main - */ - if (tb->tb_id != fa->tb_id) { - hlist_del_rcu(&fa->fa_list); - alias_free_mem_rcu(fa); - continue; - } - - /* record local slen */ - slen = fa->fa_slen; - - if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) - continue; - - switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); - } - - /* update leaf slen */ - n->slen = slen; - - if (hlist_empty(&n->leaf)) { - put_child_root(pn, n->key, NULL); - node_free(n); - } - } -} - /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb) { @@ -1909,9 +1810,6 @@ int fib_table_flush(struct net *net, struct fib_table *tb) continue; } - switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, KEYLENGTH - fa->fa_slen, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index abd8d2a38a7d..02beb35f577f 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -21,7 +21,6 @@ #include #include #include -#include #include /** @@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); - case SWITCHDEV_OBJ_ID_IPV4_FIB: - return sizeof(struct switchdev_obj_ipv4_fib); case SWITCHDEV_OBJ_ID_PORT_FDB: return sizeof(struct switchdev_obj_port_fdb); case SWITCHDEV_OBJ_ID_PORT_MDB: @@ -1108,184 +1105,6 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct net_device *port_dev; - struct list_head *iter; - - /* Recusively search down until we find a sw port dev. - * (A sw port dev supports switchdev_port_attr_get). - */ - - if (ops && ops->switchdev_port_attr_get) - return dev; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = switchdev_get_lowest_dev(lower_dev); - if (port_dev) - return port_dev; - } - - return NULL; -} - -static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) -{ - struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - }; - struct switchdev_attr prev_attr; - struct net_device *dev = NULL; - int nhsel; - - ASSERT_RTNL(); - - /* For this route, all nexthop devs must be on the same switch. */ - - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (!nh->nh_dev) - return NULL; - - dev = switchdev_get_lowest_dev(nh->nh_dev); - if (!dev) - return NULL; - - attr.orig_dev = dev; - if (switchdev_port_attr_get(dev, &attr)) - return NULL; - - if (nhsel > 0 && - !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) - return NULL; - - prev_attr = attr; - } - - return dev; -} - -/** - * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @nlflags: netlink flags passed in (NLM_F_*) - * @tb_id: route table ID - * - * Add/modify switch IPv4 route entry. - */ -int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = nlflags, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - /* Don't offload route if using custom ip rules or if - * IPv4 FIB offloading has been disabled completely. - */ - -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (fi->fib_net->ipv4.fib_has_custom_rules) - return 0; -#endif - - if (fi->fib_net->ipv4.fib_offload_disabled) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_add(dev, &ipv4_fib.obj); - if (!err) - fib_info_offload_inc(fi); - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); - -/** - * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @tb_id: route table ID - * - * Delete IPv4 route entry from switch device. - */ -int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = 0, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - if (!(fi->fib_flags & RTNH_F_OFFLOAD)) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_del(dev, &ipv4_fib.obj); - if (!err) - fib_info_offload_dec(fi); - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); - -/** - * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation - * - * @fi: route FIB info structure - */ -void switchdev_fib_ipv4_abort(struct fib_info *fi) -{ - /* There was a problem installing this route to the offload - * device. For now, until we come up with more refined - * policy handling, abruptly end IPv4 fib offloading for - * for entire net by flushing offload device(s) of all - * IPv4 routes, and mark IPv4 fib offloading broken from - * this point forward. - */ - - fib_flush_external(fi->fib_net); - fi->fib_net->ipv4.fib_offload_disabled = true; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); - bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { -- cgit v1.2.3 From fa5effe7664b1606dfd639dff58686f6dbb119d7 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Tue, 27 Sep 2016 11:09:51 +0300 Subject: net/sched: pkt_cls: change tc actions order to be as the user sets Currently the created tc actions list is reversed against the order set by the user. Change the actions list order to be the same as was set by the user. This patch doesn't affect dump actions behavior. For dumping, action->order parameter is used so the list order doesn't matter. Signed-off-by: Hadar Hen Zion Acked-by: Jamal Hadi Salim Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5ccaa4be7d96..767b03a3fe67 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -123,7 +123,7 @@ static inline void tcf_exts_to_list(const struct tcf_exts *exts, for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; - list_add(&a->list, actions); + list_add_tail(&a->list, actions); } #endif } -- cgit v1.2.3 From 484611357c19f9e19ef742ebef4505a07d243cc9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 28 Sep 2016 10:54:32 -0400 Subject: bpf: allow access into map value arrays Suppose you have a map array value that is something like this struct foo { unsigned iter; int array[SOME_CONSTANT]; }; You can easily insert this into an array, but you cannot modify the contents of foo->array[] after the fact. This is because we have no way to verify we won't go off the end of the array at verification time. This patch provides a start for this work. We accomplish this by keeping track of a minimum and maximum value a register could be while we're checking the code. Then at the time we try to do an access into a MAP_VALUE we verify that the maximum offset into that region is a valid access into that memory region. So in practice, code such as this unsigned index = 0; if (foo->iter >= SOME_CONSTANT) foo->iter = index; else index = foo->iter++; foo->array[index] = bar; would be allowed, as we can verify that index will always be between 0 and SOME_CONSTANT-1. If you wish to use signed values you'll have to have an extra check to make sure the index isn't less than 0, or do something like index %= SOME_CONSTANT. Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 + include/linux/bpf_verifier.h | 12 ++ kernel/bpf/verifier.c | 329 ++++++++++++++++++++++++++++++++++++++++--- samples/bpf/libbpf.h | 8 ++ samples/bpf/test_verifier.c | 243 +++++++++++++++++++++++++++++++- 5 files changed, 577 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5691fdc83819..c201017b5730 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -139,6 +139,13 @@ enum bpf_reg_type { */ PTR_TO_PACKET, PTR_TO_PACKET_END, /* skb->data + headlen */ + + /* PTR_TO_MAP_VALUE_ADJ is used for doing pointer math inside of a map + * elem value. We only allow this if we can statically verify that + * access from this register are going to fall within the size of the + * map element. + */ + PTR_TO_MAP_VALUE_ADJ, }; struct bpf_prog; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c5cb661712c9..7035b997aaa5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -10,8 +10,19 @@ #include /* for enum bpf_reg_type */ #include /* for MAX_BPF_STACK */ + /* Just some arbitrary values so we can safely do math without overflowing and + * are obviously wrong for any sort of memory access. + */ +#define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) +#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024) + struct bpf_reg_state { enum bpf_reg_type type; + /* + * Used to determine if any memory access using this register will + * result in a bad access. + */ + u64 min_value, max_value; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; @@ -81,6 +92,7 @@ struct bpf_verifier_env { u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool seen_direct_write; + bool varlen_map_value_access; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7ada3152a556..99a7e5b388f2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -182,6 +182,7 @@ static const char * const reg_type_str[] = { [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", @@ -209,10 +210,17 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == UNKNOWN_VALUE && reg->imm) verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + t == PTR_TO_MAP_VALUE_OR_NULL || + t == PTR_TO_MAP_VALUE_ADJ) verbose("(ks=%d,vs=%d)", reg->map_ptr->key_size, reg->map_ptr->value_size); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%llu", + (unsigned long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -424,6 +432,8 @@ static void init_reg_state(struct bpf_reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; + regs[i].min_value = BPF_REGISTER_MIN_RANGE; + regs[i].max_value = BPF_REGISTER_MAX_RANGE; } /* frame pointer */ @@ -440,6 +450,12 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) regs[regno].imm = 0; } +static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) +{ + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -665,7 +681,7 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_ptr_alignment(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size) { - if (reg->type != PTR_TO_PACKET) { + if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { if (off % size != 0) { verbose("misaligned access off %d size %d\n", off, size); @@ -675,16 +691,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, } } - switch (env->prog->type) { - case BPF_PROG_TYPE_SCHED_CLS: - case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_XDP: - break; - default: - verbose("verifier is misconfigured\n"); - return -EACCES; - } - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) /* misaligned access to packet is ok on x86,arm,arm64 */ return 0; @@ -695,7 +701,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if (reg->type == PTR_TO_PACKET && + (NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; @@ -728,12 +735,52 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE || + reg->type == PTR_TO_MAP_VALUE_ADJ) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } + + /* If we adjusted the register to this map value at all then we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (reg->type == PTR_TO_MAP_VALUE_ADJ) { + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if ((s64)reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, + size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + off += reg->max_value; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -1195,6 +1242,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1416,6 +1464,106 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, return 0; } +static void check_reg_overflow(struct bpf_reg_state *reg) +{ + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} + +static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + bool min_set = false, max_set = false; + u8 opcode = BPF_OP(insn->code); + + dst_reg = ®s[insn->dst_reg]; + if (BPF_SRC(insn->code) == BPF_X) { + check_reg_overflow(®s[insn->src_reg]); + min_val = regs[insn->src_reg].min_value; + max_val = regs[insn->src_reg].max_value; + + /* If the source register is a random pointer then the + * min_value/max_value values represent the range of the known + * accesses into that value, not the actual min/max value of the + * register itself. In this case we have to reset the reg range + * values so we know it is not safe to look at. + */ + if (regs[insn->src_reg].type != CONST_IMM && + regs[insn->src_reg].type != UNKNOWN_VALUE) { + min_val = BPF_REGISTER_MIN_RANGE; + max_val = BPF_REGISTER_MAX_RANGE; + } + } else if (insn->imm < BPF_REGISTER_MAX_RANGE && + (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { + min_val = max_val = insn->imm; + min_set = max_set = true; + } + + /* We don't know anything about what was done to this register, mark it + * as unknown. + */ + if (min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) { + reset_reg_range_values(regs, insn->dst_reg); + return; + } + + switch (opcode) { + case BPF_ADD: + dst_reg->min_value += min_val; + dst_reg->max_value += max_val; + break; + case BPF_SUB: + dst_reg->min_value -= min_val; + dst_reg->max_value -= max_val; + break; + case BPF_MUL: + dst_reg->min_value *= min_val; + dst_reg->max_value *= max_val; + break; + case BPF_AND: + /* & is special since it could end up with 0 bits set. */ + dst_reg->min_value &= min_val; + dst_reg->max_value = max_val; + break; + case BPF_LSH: + /* Gotta have special overflow logic here, if we're shifting + * more than MAX_RANGE then just assume we have an invalid + * range. + */ + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value <<= min_val; + + if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + else + dst_reg->max_value <<= max_val; + break; + case BPF_RSH: + dst_reg->min_value >>= min_val; + dst_reg->max_value >>= max_val; + break; + case BPF_MOD: + /* % is special since it is an unsigned modulus, so the floor + * will always be 0. + */ + dst_reg->min_value = 0; + dst_reg->max_value = max_val - 1; + break; + default: + reset_reg_range_values(regs, insn->dst_reg); + break; + } + + check_reg_overflow(dst_reg); +} + /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1479,6 +1627,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* we are setting our register to something new, we need to + * reset its range values. + */ + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -1500,6 +1653,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].max_value = insn->imm; + regs[insn->dst_reg].min_value = insn->imm; } } else if (opcode > BPF_END) { @@ -1552,6 +1707,9 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg = ®s[insn->dst_reg]; + /* first we want to adjust our ranges. */ + adjust_reg_min_max_vals(env, insn); + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { @@ -1586,8 +1744,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* mark dest operand */ - mark_reg_unknown_value(regs, insn->dst_reg); + /* If we did pointer math on a map value then just set it to our + * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or + * loads to this register appropriately, otherwise just mark the + * register as unknown. + */ + if (env->allow_ptr_leaks && + (dst_reg->type == PTR_TO_MAP_VALUE || + dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) + dst_reg->type = PTR_TO_MAP_VALUE_ADJ; + else + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; @@ -1642,6 +1809,104 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, } } +/* Adjusts the register min/max values in the case that the dst_reg is the + * variable register that we are working on, and src_reg is a constant or we're + * simply doing a BPF_K check. + */ +static void reg_set_min_max(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGT: + /* If this is false then we know the maximum val is val, + * otherwise we know the min val is val+1. + */ + false_reg->max_value = val; + true_reg->min_value = val + 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then we know the maximum value is val - 1, + * otherwise we know the mimimum value is val. + */ + false_reg->max_value = val - 1; + true_reg->min_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg + * is the variable reg. + */ +static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGT: + /* + * If this is false, then the val is <= the register, if it is + * true the register <= to the val. + */ + false_reg->min_value = val; + true_reg->max_value = val - 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then constant < register, if it is true then + * the register < constant. + */ + false_reg->min_value = val + 1; + true_reg->max_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1708,6 +1973,23 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (!other_branch) return -EFAULT; + /* detect if we are comparing against a constant value so we can adjust + * our min/max values for our dst register. + */ + if (BPF_SRC(insn->code) == BPF_X) { + if (regs[insn->src_reg].type == CONST_IMM) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].imm, + opcode); + else if (dst_reg->type == CONST_IMM) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], dst_reg->imm, + opcode); + } else { + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, insn->imm, opcode); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && @@ -2144,7 +2426,8 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_state *old, +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { struct bpf_reg_state *rold, *rcur; @@ -2157,6 +2440,13 @@ static bool states_equal(struct bpf_verifier_state *old, if (memcmp(rold, rcur, sizeof(*rold)) == 0) continue; + /* If the ranges were not the same, but everything else was and + * we didn't do a variable access into a map then we are a-ok. + */ + if (!env->varlen_map_value_access && + rold->type == rcur->type && rold->imm == rcur->imm) + continue; + if (rold->type == NOT_INIT || (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) continue; @@ -2213,7 +2503,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(&sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) /* reached equivalent register/stack state, * prune the search */ @@ -2259,6 +2549,7 @@ static int do_check(struct bpf_verifier_env *env) init_reg_state(regs); insn_idx = 0; + env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; @@ -2339,6 +2630,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; @@ -2509,6 +2801,7 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } + reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index 364582b77888..ac6edb61b64a 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -85,6 +85,14 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; .off = 0, \ .imm = IMM }) +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index ac590d4b7f02..369ffaad3799 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -29,6 +29,7 @@ struct bpf_test { struct bpf_insn insns[MAX_INSNS]; int fixup[MAX_FIXUPS]; int prog_array_fixup[MAX_FIXUPS]; + int test_val_map_fixup[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; enum { @@ -39,6 +40,19 @@ struct bpf_test { enum bpf_prog_type prog_type; }; +/* Note we want this to be 64 bit aligned so that the end of our array is + * actually the end of the structure. + */ +#define MAX_ENTRIES 11 +struct test_val { + unsigned index; + int foo[MAX_ENTRIES]; +}; + +struct other_val { + unsigned int action[32]; +}; + static struct bpf_test tests[] = { { "add+sub+mul", @@ -2163,6 +2177,212 @@ static struct bpf_test tests[] = { .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "valid map access into an array with a constant", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "valid map access into an array with a register", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "valid map access into an array with a variable", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, MAX_ENTRIES, 3), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "valid map access into an array with a signed variable", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), + BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "invalid map access into an array with a constant", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, (MAX_ENTRIES + 1) << 2, + offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr = "invalid access to map value, value_size=48 off=48 size=8", + .result = REJECT, + }, + { + "invalid map access into an array with a register", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, MAX_ENTRIES + 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr = "R0 min value is outside of the array range", + .result = REJECT, + }, + { + "invalid map access into an array with a variable", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + }, + { + "invalid map access into an array with no floor check", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), + BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + }, + { + "invalid map access into an array with a invalid max check", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES + 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3}, + .errstr = "invalid access to map value, value_size=48 off=44 size=8", + .result = REJECT, + }, + { + "invalid map access into an array with a invalid max check", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct test_val, foo)), + BPF_EXIT_INSN(), + }, + .test_val_map_fixup = {3, 11}, + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + }, }; static int probe_filter_length(struct bpf_insn *fp) @@ -2176,12 +2396,12 @@ static int probe_filter_length(struct bpf_insn *fp) return len + 1; } -static int create_map(void) +static int create_map(size_t val_size, int num) { int map_fd; map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, - sizeof(long long), sizeof(long long), 1024, 0); + sizeof(long long), val_size, num, 0); if (map_fd < 0) printf("failed to create map '%s'\n", strerror(errno)); @@ -2211,12 +2431,13 @@ static int test(void) int prog_len = probe_filter_length(prog); int *fixup = tests[i].fixup; int *prog_array_fixup = tests[i].prog_array_fixup; + int *test_val_map_fixup = tests[i].test_val_map_fixup; int expected_result; const char *expected_errstr; - int map_fd = -1, prog_array_fd = -1; + int map_fd = -1, prog_array_fd = -1, test_val_map_fd = -1; if (*fixup) { - map_fd = create_map(); + map_fd = create_map(sizeof(long long), 1024); do { prog[*fixup].imm = map_fd; @@ -2231,6 +2452,18 @@ static int test(void) prog_array_fixup++; } while (*prog_array_fixup); } + if (*test_val_map_fixup) { + /* Unprivileged can't create a hash map.*/ + if (unpriv) + continue; + test_val_map_fd = create_map(sizeof(struct test_val), + 256); + do { + prog[*test_val_map_fixup].imm = test_val_map_fd; + test_val_map_fixup++; + } while (*test_val_map_fixup); + } + printf("#%d %s ", i, tests[i].descr); prog_fd = bpf_prog_load(prog_type ?: BPF_PROG_TYPE_SOCKET_FILTER, @@ -2277,6 +2510,8 @@ fail: close(map_fd); if (prog_array_fd >= 0) close(prog_array_fd); + if (test_val_map_fd >= 0) + close(test_val_map_fd); close(prog_fd); } -- cgit v1.2.3 From a1767077b0176de17fa40ec743a20cbdac7a0d56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:15 +0100 Subject: rxrpc: Make Tx loss-injection go through normal return and adjust tracing In rxrpc_send_data_packet() make the loss-injection path return through the same code as the transmission path so that the RTT determination is initiated and any future timer shuffling will be done, despite the packet having been binned. Whilst we're at it: (1) Add to the tx_data tracepoint an indication of whether or not we're retransmitting a data packet. (2) When we're deciding whether or not to request an ACK, rather than checking if we're in fast-retransmit mode check instead if we're retransmitting. (3) Don't invoke the lose_skb tracepoint when losing a Tx packet as we're not altering the sk_buff refcount nor are we just seeing it after getting it off the Tx list. (4) The rxrpc_skb_tx_lost note is then no longer used so remove it. (5) rxrpc_lose_skb() no longer needs to deal with rxrpc_skb_tx_lost. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 ++++-- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_event.c | 2 +- net/rxrpc/misc.c | 1 - net/rxrpc/output.c | 17 +++++++++-------- net/rxrpc/sendmsg.c | 2 +- net/rxrpc/skbuff.c | 11 +++-------- 7 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ada12d00118c..8ba8d76e856a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -258,15 +258,16 @@ TRACE_EVENT(rxrpc_rx_ack, TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, u8 flags, bool lose), + rxrpc_serial_t serial, u8 flags, bool retrans, bool lose), - TP_ARGS(call, seq, serial, flags, lose), + TP_ARGS(call, seq, serial, flags, retrans, lose), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) __field(u8, flags ) + __field(bool, retrans ) __field(bool, lose ) ), @@ -275,6 +276,7 @@ TRACE_EVENT(rxrpc_tx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; + __entry->retrans = retrans; __entry->lose = lose; ), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ca96e547cb9a..6aadaa7d8b43 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -603,7 +603,6 @@ enum rxrpc_skb_trace { rxrpc_skb_tx_cleaned, rxrpc_skb_tx_freed, rxrpc_skb_tx_got, - rxrpc_skb_tx_lost, rxrpc_skb_tx_new, rxrpc_skb_tx_rotated, rxrpc_skb_tx_seen, @@ -1073,7 +1072,7 @@ extern const s8 rxrpc_ack_priority[]; * output.c */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); -int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *); +int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0e8478012212..1f6c7633b964 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -256,7 +256,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); - if (rxrpc_send_data_packet(call, skb) < 0) { + if (rxrpc_send_data_packet(call, skb, true) < 0) { rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index aedb8978226d..47dddacdbb91 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -108,7 +108,6 @@ const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_tx_cleaned] = "Tx CLN", [rxrpc_skb_tx_freed] = "Tx FRE", [rxrpc_skb_tx_got] = "Tx GOT", - [rxrpc_skb_tx_lost] = "Tx *L*", [rxrpc_skb_tx_new] = "Tx NEW", [rxrpc_skb_tx_rotated] = "Tx ROT", [rxrpc_skb_tx_seen] = "Tx SEE", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index cf43a715685e..ac9a58b619a6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -238,7 +238,8 @@ out: /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool retrans) { struct rxrpc_connection *conn = call->conn; struct rxrpc_wire_header whdr; @@ -247,6 +248,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) struct kvec iov[2]; rxrpc_serial_t serial; size_t len; + bool lost = false; int ret, opt; _enter(",{%d}", skb->len); @@ -281,7 +283,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. */ - if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + if (retrans || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) @@ -290,11 +292,9 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, - whdr.flags, true); - rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); - _leave(" = 0 [lose]"); - return 0; + ret = 0; + lost = true; + goto done; } } @@ -319,7 +319,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, + retrans, lost); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 1f8040d82395..d8dfdce874d8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -144,7 +144,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call, skb); + ret = rxrpc_send_data_packet(call, skb, false); if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 5154cbf7e540..67b02c45271b 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -77,14 +77,9 @@ void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - if (op == rxrpc_skb_tx_lost) { - n = atomic_read(select_skb_count(op)); - trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); - } else { - n = atomic_dec_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); - kfree_skb(skb); - } + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); } } -- cgit v1.2.3 From 6348ef2dbbd96c4488a1ee83cc0f0f3d9a314a2f Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:28:58 +0800 Subject: net:snmp: Introduce generic interfaces for snmp_get_cpu_field{, 64} This is to introduce the generic interfaces for snmp_get_cpu_field{,64}. It exchanges the two for-loops for collecting the percpu statistics data. This can aggregate the data by going through all the items of each cpu sequentially. Signed-off-by: Jia He Suggested-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/ip.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9742b92dc933..bc43c0fcae12 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -219,6 +219,29 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } #endif +#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; stats_list[i].name; i++) \ + buff64[i] += snmp_get_cpu_field64( \ + mib_statistic, \ + c, stats_list[i].entry, \ + offset); \ + } \ +} + +#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; stats_list[i].name; i++) \ + buff[i] += snmp_get_cpu_field( \ + mib_statistic, \ + c, stats_list[i].entry); \ + } \ +} + void inet_get_local_port_range(struct net *net, int *low, int *high); #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From bd11f0741fa5a2c296629898ad07759dd12b35bb Mon Sep 17 00:00:00 2001 From: Maciej Å»enczykowski Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (inline with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Å»enczykowski Acked-by: Erik Kline Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/net/addrconf.h | 3 ++- include/net/if_inet6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 51 ++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 49 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c6dbcd84a2c7..7e9a789be5e0 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -18,6 +18,7 @@ struct ipv6_devconf { __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; + __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9826d3a9464c..f2d072787947 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -1,8 +1,9 @@ #ifndef _ADDRCONF_H #define _ADDRCONF_H -#define MAX_RTR_SOLICITATIONS 3 +#define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) +#define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 1c8b6820b694..515352c6280a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -201,6 +201,7 @@ struct inet6_dev { struct ipv6_devstat stats; struct timer_list rs_timer; + __s32 rs_interval; /* in jiffies */ __u8 rs_probes; __u8 addr_gen_mode; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 395876060f50..8c2772340c3f 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -177,6 +177,7 @@ enum { DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_DROP_UNSOLICITED_NA, DEVCONF_KEEP_ADDR_ON_DOWN, + DEVCONF_RTR_SOLICIT_MAX_INTERVAL, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 35d4baa55c9d..87183983724d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } +static inline s32 rfc3315_s14_backoff_init(s32 irt) +{ + /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ + u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + do_div(tmp, 1000000); + return (s32)tmp; +} + +static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) +{ + /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ + u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + do_div(tmp, 1000000); + if ((s32)tmp > mrt) { + /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */ + tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + do_div(tmp, 1000000); + } + return (s32)tmp; +} + #ifdef CONFIG_SYSCTL static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data) goto put; write_lock(&idev->lock); + idev->rs_interval = rfc3315_s14_backoff_update( + idev->rs_interval, idev->cnf.rtr_solicit_max_interval); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); + idev->rs_interval); } else { /* * Note: we do not support deprecated "all on-link" @@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits > 0 && + ifp->idev->cnf.rtr_solicits != 0 && (dev->flags&IFF_LOOPBACK) == 0; read_unlock_bh(&ifp->idev->lock); @@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); + ifp->idev->rs_interval = rfc3315_s14_backoff_init( + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_rs_timer(ifp->idev, - ifp->idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } @@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; array[DEVCONF_RTR_SOLICIT_INTERVAL] = jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_max_interval); array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; @@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) return -EINVAL; if (!ipv6_accept_ra(idev)) return -EINVAL; - if (idev->cnf.rtr_solicits <= 0) + if (idev->cnf.rtr_solicits == 0) return -EINVAL; write_lock_bh(&idev->lock); @@ -5128,8 +5156,10 @@ update_lft: if (update_rs) { idev->if_flags |= IF_RS_SENT; + idev->rs_interval = rfc3315_s14_backoff_init( + idev->cnf.rtr_solicit_interval); idev->rs_probes = 1; - addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(idev, idev->rs_interval); } /* Well, that's kinda nasty ... */ @@ -5777,6 +5807,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "router_solicitation_max_interval", + .data = &ipv6_devconf.rtr_solicit_max_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { .procname = "router_solicitation_delay", .data = &ipv6_devconf.rtr_solicit_delay, -- cgit v1.2.3 From 265a44bbf23e4ed1c76409f07595ea88351ba4b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 08:50:42 +0100 Subject: rxrpc: Actually display the tx_data trace retransmission note Actually display in the tx_data trace the retransmission note added in a previous patch. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 8ba8d76e856a..67f03946ea4a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -280,11 +280,12 @@ TRACE_EVENT(rxrpc_tx_data, __entry->lose = lose; ), - TP_printk("c=%p DATA %08x q=%08x fl=%02x%s", + TP_printk("c=%p DATA %08x q=%08x fl=%02x%s%s", __entry->call, __entry->serial, __entry->seq, __entry->flags, + __entry->retrans ? " *RETRANS*" : "", __entry->lose ? " *LOSE*" : "") ); -- cgit v1.2.3 From b8676221f00dd5b6018f0fd88cd278f93e11143a Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Thu, 22 Sep 2016 23:16:50 +0300 Subject: cfg80211: Add support for static WEP in the driver Add support for drivers that implement static WEP internally, i.e. expose connection keys to the driver in connect flow and don't upload the keys after the connection. Signed-off-by: David Spinadel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++++++- net/wireless/core.h | 4 ++-- net/wireless/ibss.c | 5 +++-- net/wireless/sme.c | 6 +++++- net/wireless/util.c | 2 +- net/wireless/wext-compat.c | 2 +- net/wireless/wext-sme.c | 2 +- 7 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ed37304fa09d..68dca3d93b85 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5,7 +5,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -593,6 +593,8 @@ struct survey_info { s8 noise; }; +#define CFG80211_MAX_WEP_KEYS 4 + /** * struct cfg80211_crypto_settings - Crypto settings * @wpa_versions: indicates which, if any, WPA versions are enabled @@ -610,6 +612,9 @@ struct survey_info { * allowed through even on unauthorized ports * @control_port_no_encrypt: TRUE to prevent encryption of control port * protocol frames. + * @wep_keys: static WEP keys, if not NULL points to an array of + * CFG80211_MAX_WEP_KEYS WEP keys + * @wep_tx_key: key index (0..3) of the default TX static WEP key */ struct cfg80211_crypto_settings { u32 wpa_versions; @@ -621,6 +626,8 @@ struct cfg80211_crypto_settings { bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; + struct key_params *wep_keys; + int wep_tx_key; }; /** @@ -2905,6 +2912,8 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). + * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation + * before connection. */ enum wiphy_flags { /* use hole at 0 */ @@ -2930,6 +2939,7 @@ enum wiphy_flags { WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL = BIT(21), WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), WIPHY_FLAG_HAS_CHANNEL_SWITCH = BIT(23), + WIPHY_FLAG_HAS_STATIC_WEP = BIT(24), }; /** diff --git a/net/wireless/core.h b/net/wireless/core.h index 5555e3c13ae9..554f87d0f991 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -249,8 +249,8 @@ struct cfg80211_event { }; struct cfg80211_cached_keys { - struct key_params params[4]; - u8 data[4][WLAN_KEY_LEN_WEP104]; + struct key_params params[CFG80211_MAX_WEP_KEYS]; + u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104]; int def; }; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index eafdfa5798ae..364f900a3dc4 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -43,7 +43,8 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); wdev->current_bss = bss_from_pub(bss); - cfg80211_upload_connect_keys(wdev); + if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) + cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); @@ -296,7 +297,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < 4; i++) + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index c08a3b57dca1..a77db333927e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -726,7 +726,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, wdev->current_bss = bss_from_pub(bss); - cfg80211_upload_connect_keys(wdev); + if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) + cfg80211_upload_connect_keys(wdev); rcu_read_lock(); country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY); @@ -1043,6 +1044,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } + + connect->crypto.wep_keys = connkeys->params; + connect->crypto.wep_tx_key = connkeys->def; } else { if (WARN_ON(connkeys)) return -EINVAL; diff --git a/net/wireless/util.c b/net/wireless/util.c index 9e6e2aaa7766..e02141d66b69 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -912,7 +912,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) if (!wdev->connect_keys) return; - for (i = 0; i < 4; i++) { + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, i, false, NULL, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 2b096c02eb85..a220156cf217 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -415,7 +415,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; - for (i = 0; i < 4; i++) + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 88f1f6931ab8..995163830a61 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -46,7 +46,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < 4; i++) + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) ck->params[i].key = ck->data[i]; } -- cgit v1.2.3 From cb3b7d87652aeb37cfb5295a6157a3280dae10cb Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:13 +0300 Subject: cfg80211: add start / stop NAN commands This allows user space to start/stop NAN interface. A NAN interface is like P2P device in a few aspects: it doesn't have a netdev associated to it. Add the new interface type and prevent operations that can't be executed on NAN interface like scan. Define several attributes that may be configured by user space when starting NAN functionality (master preference and dual band operation) Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 21 +++++++++- include/uapi/linux/nl80211.h | 47 +++++++++++++++++++++++ net/mac80211/cfg.c | 2 + net/mac80211/chan.c | 3 ++ net/mac80211/iface.c | 4 ++ net/mac80211/offchannel.c | 1 + net/mac80211/rx.c | 3 ++ net/mac80211/util.c | 1 + net/wireless/chan.c | 2 + net/wireless/core.c | 34 +++++++++++++++++ net/wireless/core.h | 3 ++ net/wireless/mlme.c | 1 + net/wireless/nl80211.c | 91 ++++++++++++++++++++++++++++++++++++++++++-- net/wireless/rdev-ops.h | 20 ++++++++++ net/wireless/trace.h | 27 +++++++++++++ net/wireless/util.c | 6 ++- 16 files changed, 260 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 68dca3d93b85..9898e1f883e2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2313,6 +2313,19 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_conf - NAN configuration + * + * This struct defines NAN configuration parameters + * + * @master_pref: master preference (1 - 255) + * @dual: dual band operation mode, see &enum nl80211_nan_dual_band_conf + */ +struct cfg80211_nan_conf { + u8 master_pref; + u8 dual; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -2601,6 +2614,8 @@ struct cfg80211_qos_map { * and returning to the base channel for communication with the AP. * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both * peers must be on the base channel when the call completes. + * @start_nan: Start the NAN interface. + * @stop_nan: Stop the NAN interface. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2866,6 +2881,9 @@ struct cfg80211_ops { void (*tdls_cancel_channel_switch)(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); + int (*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf); + void (*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev); }; /* @@ -3626,6 +3644,7 @@ struct cfg80211_cached_keys; * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL * @p2p_started: true if this is a P2P Device that has been started + * @nan_started: true if this is a NAN interface that has been started * @cac_started: true if DFS channel availability check has been started * @cac_start_time: timestamp (jiffies) when the dfs state was entered. * @cac_time_ms: CAC time in ms @@ -3657,7 +3676,7 @@ struct wireless_dev { struct mutex mtx; - bool use_4addr, p2p_started; + bool use_4addr, p2p_started, nan_started; u8 address[ETH_ALEN] __aligned(sizeof(u16)); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ec10d1b2838f..98fd3ec8598d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -838,6 +838,16 @@ * not running. The driver indicates the status of the scan through * cfg80211_scan_done(). * + * @NL80211_CMD_START_NAN: Start NAN operation, identified by its + * %NL80211_ATTR_WDEV interface. This interface must have been previously + * created with %NL80211_CMD_NEW_INTERFACE. After it has been started, the + * NAN interface will create or join a cluster. This command must have a + * valid %NL80211_ATTR_NAN_MASTER_PREF attribute and optional + * %NL80211_ATTR_NAN_DUAL attributes. + * After this command NAN functions can be added. + * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by + * its %NL80211_ATTR_WDEV interface. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1026,6 +1036,9 @@ enum nl80211_commands { NL80211_CMD_ABORT_SCAN, + NL80211_CMD_START_NAN, + NL80211_CMD_STOP_NAN, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1739,6 +1752,12 @@ enum nl80211_commands { * regulatory indoor configuration would be owned by the netlink socket * that configured the indoor setting, and the indoor operation would be * cleared when the socket is closed. + * If set during NAN interface creation, the interface will be destroyed + * if the socket is closed just like any other interface. Moreover, only + * the netlink socket that created the interface will be allowed to add + * and remove functions. NAN notifications will be sent in unicast to that + * socket. Without this attribute, any socket can add functions and the + * notifications will be sent to the %NL80211_MCGRP_NAN multicast group. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. @@ -1873,6 +1892,14 @@ enum nl80211_commands { * @NL80211_ATTR_MESH_PEER_AID: Association ID for the mesh peer (u16). This is * used to pull the stored data for mesh peer in power save state. * + * @NL80211_ATTR_NAN_MASTER_PREF: the master preference to be used by + * %NL80211_CMD_START_NAN. Its type is u8 and it can't be 0. + * Also, values 1 and 255 are reserved for certification purposes and + * should not be used during a normal device operation. + * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see + * &enum nl80211_nan_dual_band_conf). This attribute is used with + * %NL80211_CMD_START_NAN. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2267,6 +2294,9 @@ enum nl80211_attrs { NL80211_ATTR_MESH_PEER_AID, + NL80211_ATTR_NAN_MASTER_PREF, + NL80211_ATTR_NAN_DUAL, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -2345,6 +2375,7 @@ enum nl80211_attrs { * commands to create and destroy one * @NL80211_IF_TYPE_OCB: Outside Context of a BSS * This mode corresponds to the MIB variable dot11OCBActivated=true + * @NL80211_IFTYPE_NAN: NAN device interface type (not a netdev) * @NL80211_IFTYPE_MAX: highest interface type number currently defined * @NUM_NL80211_IFTYPES: number of defined interface types * @@ -2365,6 +2396,7 @@ enum nl80211_iftype { NL80211_IFTYPE_P2P_GO, NL80211_IFTYPE_P2P_DEVICE, NL80211_IFTYPE_OCB, + NL80211_IFTYPE_NAN, /* keep last */ NUM_NL80211_IFTYPES, @@ -4870,4 +4902,19 @@ enum nl80211_bss_select_attr { NL80211_BSS_SELECT_ATTR_MAX = __NL80211_BSS_SELECT_ATTR_AFTER_LAST - 1 }; +/** + * enum nl80211_nan_dual_band_conf - NAN dual band configuration + * + * Defines the NAN dual band mode of operation + * + * @NL80211_NAN_BAND_DEFAULT: device default mode + * @NL80211_NAN_BAND_2GHZ: 2.4GHz mode + * @NL80211_NAN_BAND_5GHZ: 5GHz mode + */ +enum nl80211_nan_dual_band_conf { + NL80211_NAN_BAND_DEFAULT = 1 << 0, + NL80211_NAN_BAND_2GHZ = 1 << 1, + NL80211_NAN_BAND_5GHZ = 1 << 2, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e29ff5749944..a74027f887bc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -257,6 +257,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: @@ -2036,6 +2037,7 @@ static int ieee80211_scan(struct wiphy *wiphy, !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 74142d07ad31..d035801569eb 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -274,6 +274,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, ieee80211_get_max_required_bw(sdata)); break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_WDS: @@ -718,6 +719,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_STATION: if (!sdata->u.mgd.associated) @@ -980,6 +982,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata) case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: case NUM_NL80211_IFTYPES: WARN_ON(1); break; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b0abddc714ef..e694ca2baad0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -545,6 +545,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_NAN: /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: @@ -660,6 +661,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: break; default: /* not reached */ @@ -948,6 +950,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); /* fall through */ + case NL80211_IFTYPE_NAN: default: cancel_work_sync(&sdata->work); /* @@ -1457,6 +1460,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 55a9c5b94ce1..75d5c960ce67 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -838,6 +838,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e796060b7c5e..c9489a86e6d6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3586,6 +3586,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) ieee80211_is_probe_req(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control) || ieee80211_is_beacon(hdr->frame_control); + case NL80211_IFTYPE_NAN: + /* Currently no frames on NAN interface are allowed */ + return false; default: break; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b6865d884487..2c78541f695c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1975,6 +1975,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 0f506220a3bd..5497d022fada 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -372,6 +372,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: @@ -946,6 +947,7 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: /* these interface types don't really have a channel */ return; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/wireless/core.c b/net/wireless/core.c index 4911cd997b9a..013987243c0b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -225,6 +225,23 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, } } +void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + ASSERT_RTNL(); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) + return; + + if (!wdev->nan_started) + return; + + rdev_stop_nan(rdev, wdev); + wdev->nan_started = false; + + rdev->opencount--; +} + void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -242,6 +259,9 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; + case NL80211_IFTYPE_NAN: + cfg80211_stop_nan(rdev, wdev); + break; default: break; } @@ -537,6 +557,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) c->limits[j].max > 1)) return -EINVAL; + /* Only a single NAN can be allowed */ + if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && + c->limits[j].max > 1)) + return -EINVAL; + cnt += c->limits[j].max; /* * Don't advertise an unsupported type @@ -579,6 +604,10 @@ int wiphy_register(struct wiphy *wiphy) !rdev->ops->tdls_cancel_channel_switch))) return -EINVAL; + if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && + (!rdev->ops->start_nan || !rdev->ops->stop_nan))) + return -EINVAL; + /* * if a wiphy has unsupported modes for regulatory channel enforcement, * opt-out of enforcement checking @@ -589,6 +618,7 @@ int wiphy_register(struct wiphy *wiphy) BIT(NL80211_IFTYPE_P2P_GO) | BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_P2P_DEVICE) | + BIT(NL80211_IFTYPE_NAN) | BIT(NL80211_IFTYPE_AP_VLAN) | BIT(NL80211_IFTYPE_MONITOR))) wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF; @@ -916,6 +946,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) cfg80211_mlme_purge_registrations(wdev); cfg80211_stop_p2p_device(rdev, wdev); break; + case NL80211_IFTYPE_NAN: + cfg80211_stop_nan(rdev, wdev); + break; default: WARN_ON_ONCE(1); break; @@ -979,6 +1012,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, /* must be handled by mac80211/driver, has no APIs */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ break; case NL80211_IFTYPE_AP_VLAN: diff --git a/net/wireless/core.h b/net/wireless/core.h index 554f87d0f991..08d2e948c9ad 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -488,6 +488,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + #define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index d6abb0704db5..cbb48e26a871 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -634,6 +634,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, * fall through, P2P device only supports * public action frames */ + case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b8441e60b0f6..9e9fb37087fc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -409,6 +409,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, + [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, + [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -934,6 +936,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: @@ -2819,7 +2822,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.interface_modes & (1 << type))) return -EOPNOTSUPP; - if ((type == NL80211_IFTYPE_P2P_DEVICE || + if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN || rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) && info->attrs[NL80211_ATTR_MAC]) { nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC], @@ -2875,9 +2878,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev->mesh_id_up_len); wdev_unlock(wdev); break; + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_P2P_DEVICE: /* - * P2P Device doesn't have a netdev, so doesn't go + * P2P Device and NAN do not have a netdev, so don't go * through the netdev notifier and must be added here */ mutex_init(&wdev->mtx); @@ -6434,6 +6438,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) wiphy = &rdev->wiphy; + if (wdev->iftype == NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + if (!rdev->ops->scan) return -EOPNOTSUPP; @@ -8977,6 +8984,7 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -9022,6 +9030,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_P2P_GO: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -9138,6 +9147,7 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -10504,6 +10514,58 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct cfg80211_nan_conf conf = {}; + int err; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (wdev->nan_started) + return -EEXIST; + + if (rfkill_blocked(rdev->rfkill)) + return -ERFKILL; + + if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_DUAL]) + return -EINVAL; + + conf.master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + if (!conf.master_pref) + return -EINVAL; + + conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + + err = rdev_start_nan(rdev, wdev, &conf); + if (err) + return err; + + wdev->nan_started = true; + rdev->opencount++; + + return 0; +} + +static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + cfg80211_stop_nan(rdev, wdev); + + return 0; +} + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { @@ -11205,7 +11267,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, dev_hold(dev); } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) { - if (!wdev->p2p_started) { + if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE && + !wdev->p2p_started) { + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } + if (wdev->iftype == NL80211_IFTYPE_NAN && + !wdev->nan_started) { if (rtnl) rtnl_unlock(); return -ENETDOWN; @@ -11838,6 +11907,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_START_NAN, + .doit = nl80211_start_nan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_STOP_NAN, + .doit = nl80211_stop_nan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 85ff30bee2b9..afb68a8428b9 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -887,6 +887,26 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf) +{ + int ret; + + trace_rdev_start_nan(&rdev->wiphy, wdev, conf); + ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + trace_rdev_stop_nan(&rdev->wiphy, wdev); + rdev->ops->stop_nan(&rdev->wiphy, wdev); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 72b5255cefe2..5f3370f4c6a2 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1889,6 +1889,33 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device, TP_ARGS(wiphy, wdev) ); +TRACE_EVENT(rdev_start_nan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf), + TP_ARGS(wiphy, wdev, conf), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, master_pref) + __field(u8, dual); + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", master preference: %u, dual: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, + __entry->dual) +); + +DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); + TRACE_EVENT(rdev_set_mac_acl, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_acl_data *params), diff --git a/net/wireless/util.c b/net/wireless/util.c index e02141d66b69..7a2d46b0058a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1008,8 +1008,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; - /* cannot change into P2P device type */ - if (ntype == NL80211_IFTYPE_P2P_DEVICE) + /* cannot change into P2P device or NAN */ + if (ntype == NL80211_IFTYPE_P2P_DEVICE || + ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || @@ -1088,6 +1089,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: WARN_ON(1); break; } -- cgit v1.2.3 From 708d50edb149fe488c7c96f59ba9a89a64985cf2 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:14 +0300 Subject: mac80211: add boilerplate code for start / stop NAN This code doesn't do much besides allowing to start and stop the vif. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Ayala Beker Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 9 +++++++++ net/mac80211/cfg.c | 36 ++++++++++++++++++++++++++++++++++ net/mac80211/chan.c | 3 +++ net/mac80211/driver-ops.h | 27 +++++++++++++++++++++++++ net/mac80211/iface.c | 8 ++++++-- net/mac80211/main.c | 5 +++++ net/mac80211/offchannel.c | 3 ++- net/mac80211/trace.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/util.c | 3 ++- 9 files changed, 140 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5296100f3889..df9b5cff300c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3420,6 +3420,9 @@ enum ieee80211_reconfig_type { * synchronization which is needed in case driver has in its RSS queues * pending frames that were received prior to the control path action * currently taken (e.g. disassociation) but are not processed yet. + * + * @start_nan: join an existing NAN cluster, or create a new one. + * @stop_nan: leave the NAN cluster. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -3655,6 +3658,12 @@ struct ieee80211_ops { void (*wake_tx_queue)(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void (*sync_rx_queues)(struct ieee80211_hw *hw); + + int (*start_nan)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct cfg80211_nan_conf *conf); + int (*stop_nan)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a74027f887bc..9aabb0932d24 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2015-2016 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. */ @@ -152,6 +153,39 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } +static int ieee80211_start_nan(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + int ret; + + mutex_lock(&sdata->local->chanctx_mtx); + ret = ieee80211_check_combinations(sdata, NULL, 0, 0); + mutex_unlock(&sdata->local->chanctx_mtx); + if (ret < 0) + return ret; + + ret = ieee80211_do_open(wdev, true); + if (ret) + return ret; + + ret = drv_start_nan(sdata->local, sdata, conf); + if (ret) + ieee80211_sdata_stop(sdata); + + return ret; +} + +static void ieee80211_stop_nan(struct wiphy *wiphy, + struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + drv_stop_nan(sdata->local, sdata); + ieee80211_sdata_stop(sdata); +} + static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) @@ -3464,4 +3498,6 @@ const struct cfg80211_ops mac80211_config_ops = { .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = ieee80211_del_tx_ts, + .start_nan = ieee80211_start_nan, + .stop_nan = ieee80211_stop_nan, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index d035801569eb..e75cbf6ecc26 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -647,6 +647,9 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *curr_ctx = NULL; int ret = 0; + if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) + return -ENOTSUPP; + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index fe35a1c0dc86..e52cfb855bd9 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -162,6 +162,7 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || + sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.mu_mimo_owner))) return; @@ -1165,4 +1166,30 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, local->ops->wake_tx_queue(&local->hw, &txq->txq); } +static inline int drv_start_nan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_start_nan(local, sdata, conf); + ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_stop_nan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_stop_nan(local, sdata); + local->ops->stop_nan(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e694ca2baad0..507f46a8eb1c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -327,6 +327,9 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, int n_queues = sdata->local->hw.queues; int i; + if (iftype == NL80211_IFTYPE_NAN) + return 0; + if (iftype != NL80211_IFTYPE_P2P_DEVICE) { for (i = 0; i < IEEE80211_NUM_ACS; i++) { if (WARN_ON_ONCE(sdata->vif.hw_queue[i] == @@ -647,7 +650,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); ieee80211_bss_info_change_notify(sdata, changed); @@ -1726,7 +1730,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ASSERT_RTNL(); - if (type == NL80211_IFTYPE_P2P_DEVICE) { + if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) { struct wireless_dev *wdev; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ac053a9df36d..b5cf2c5cc166 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -821,6 +821,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) !local->ops->tdls_recv_channel_switch)) return -EOPNOTSUPP; + if (WARN_ON(local->hw.wiphy->interface_modes & + BIT(NL80211_IFTYPE_NAN) && + (!local->ops->start_nan || !local->ops->stop_nan))) + return -EINVAL; + #ifdef CONFIG_PM if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume)) return -EINVAL; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 75d5c960ce67..c3f610bba3fe 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -128,7 +128,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) if (!ieee80211_sdata_running(sdata)) continue; - if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) + if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || + sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 77e4c53baefb..deefbfb9f6fb 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1700,6 +1700,56 @@ TRACE_EVENT(drv_get_expected_throughput, ) ); +TRACE_EVENT(drv_start_nan, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf), + + TP_ARGS(local, sdata, conf), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, master_pref) + __field(u8, dual) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", master preference: %u, dual: %d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, + __entry->dual + ) +); + +TRACE_EVENT(drv_stop_nan, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + + TP_ARGS(local, sdata), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG + ) +); + /* * Tracing for API calls that drivers call. */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 2c78541f695c..5b57fcaaec9b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1209,7 +1209,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && - sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) { + sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) { sdata->vif.bss_conf.qos = enable_qos; if (bss_notify) ieee80211_bss_info_change_notify(sdata, -- cgit v1.2.3 From a442b761b24b6886f9a4e2ff5f8cb4824c96526b Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:15 +0300 Subject: cfg80211: add add_nan_func / del_nan_func A NAN function can be either publish, subscribe or follow up. Make all the necessary verifications and just pass the request to the driver. Allow the user space application that starts NAN to forbid any other socket to add or remove functions. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Ayala Beker Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 91 +++++++++++ include/uapi/linux/nl80211.h | 150 ++++++++++++++++++ net/wireless/core.c | 3 +- net/wireless/nl80211.c | 369 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 21 +++ net/wireless/trace.h | 39 +++++ net/wireless/util.c | 22 +++ 7 files changed, 694 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9898e1f883e2..2f35ccf6da83 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2326,6 +2326,73 @@ struct cfg80211_nan_conf { u8 dual; }; +/** + * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter + * + * @filter: the content of the filter + * @len: the length of the filter + */ +struct cfg80211_nan_func_filter { + const u8 *filter; + u8 len; +}; + +/** + * struct cfg80211_nan_func - a NAN function + * + * @type: &enum nl80211_nan_function_type + * @service_id: the service ID of the function + * @publish_type: &nl80211_nan_publish_type + * @close_range: if true, the range should be limited. Threshold is + * implementation specific. + * @publish_bcast: if true, the solicited publish should be broadcasted + * @subscribe_active: if true, the subscribe is active + * @followup_id: the instance ID for follow up + * @followup_reqid: the requestor instance ID for follow up + * @followup_dest: MAC address of the recipient of the follow up + * @ttl: time to live counter in DW. + * @serv_spec_info: Service Specific Info + * @serv_spec_info_len: Service Specific Info length + * @srf_include: if true, SRF is inclusive + * @srf_bf: Bloom Filter + * @srf_bf_len: Bloom Filter length + * @srf_bf_idx: Bloom Filter index + * @srf_macs: SRF MAC addresses + * @srf_num_macs: number of MAC addresses in SRF + * @rx_filters: rx filters that are matched with corresponding peer's tx_filter + * @tx_filters: filters that should be transmitted in the SDF. + * @num_rx_filters: length of &rx_filters. + * @num_tx_filters: length of &tx_filters. + * @instance_id: driver allocated id of the function. + * @cookie: unique NAN function identifier. + */ +struct cfg80211_nan_func { + enum nl80211_nan_function_type type; + u8 service_id[NL80211_NAN_FUNC_SERVICE_ID_LEN]; + u8 publish_type; + bool close_range; + bool publish_bcast; + bool subscribe_active; + u8 followup_id; + u8 followup_reqid; + struct mac_address followup_dest; + u32 ttl; + const u8 *serv_spec_info; + u8 serv_spec_info_len; + bool srf_include; + const u8 *srf_bf; + u8 srf_bf_len; + u8 srf_bf_idx; + struct mac_address *srf_macs; + int srf_num_macs; + struct cfg80211_nan_func_filter *rx_filters; + struct cfg80211_nan_func_filter *tx_filters; + u8 num_tx_filters; + u8 num_rx_filters; + u8 instance_id; + u64 cookie; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -2616,6 +2683,14 @@ struct cfg80211_nan_conf { * peers must be on the base channel when the call completes. * @start_nan: Start the NAN interface. * @stop_nan: Stop the NAN interface. + * @add_nan_func: Add a NAN function. Returns negative value on failure. + * On success @nan_func ownership is transferred to the driver and + * it may access it outside of the scope of this function. The driver + * should free the @nan_func when no longer needed by calling + * cfg80211_free_nan_func(). + * On success the driver should assign an instance_id in the + * provided @nan_func. + * @del_nan_func: Delete a NAN function. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2884,6 +2959,10 @@ struct cfg80211_ops { int (*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf); void (*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev); + int (*add_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_func *nan_func); + void (*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev, + u64 cookie); }; /* @@ -3335,6 +3414,8 @@ struct wiphy_iftype_ext_capab { * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. + * + * @cookie_counter: unique generic cookie counter, used to identify objects. */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -3464,6 +3545,8 @@ struct wiphy { u32 bss_select_support; + u64 cookie_counter; + char priv[0] __aligned(NETDEV_ALIGN); }; @@ -5584,6 +5667,14 @@ wiphy_ext_feature_isset(struct wiphy *wiphy, return (ft_byte & BIT(ftidx % 8)) != 0; } +/** + * cfg80211_free_nan_func - free NAN function + * @f: NAN function that should be freed + * + * Frees all the NAN function and all it's allocated members. + */ +void cfg80211_free_nan_func(struct cfg80211_nan_func *f); + /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98fd3ec8598d..e4935d963061 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -847,6 +847,21 @@ * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. + * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined + * with %NL80211_ATTR_NAN_FUNC nested attribute. When called, this + * operation returns the strictly positive and unique instance id + * (%NL80211_ATTR_NAN_FUNC_INST_ID) and a cookie (%NL80211_ATTR_COOKIE) + * of the function upon success. + * Since instance ID's can be re-used, this cookie is the right + * way to identify the function. This will avoid races when a termination + * event is handled by the user space after it has already added a new + * function that got the same instance id from the kernel as the one + * which just terminated. + * This cookie may be used in NAN events even before the command + * returns, so userspace shouldn't process NAN events until it processes + * the response to this command. + * Look at %NL80211_ATTR_SOCKET_OWNER as well. + * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1038,6 +1053,8 @@ enum nl80211_commands { NL80211_CMD_START_NAN, NL80211_CMD_STOP_NAN, + NL80211_CMD_ADD_NAN_FUNCTION, + NL80211_CMD_DEL_NAN_FUNCTION, /* add new commands above here */ @@ -1899,6 +1916,9 @@ enum nl80211_commands { * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see * &enum nl80211_nan_dual_band_conf). This attribute is used with * %NL80211_CMD_START_NAN. + * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See + * &enum nl80211_nan_func_attributes for description of this nested + * attribute. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2296,6 +2316,7 @@ enum nl80211_attrs { NL80211_ATTR_NAN_MASTER_PREF, NL80211_ATTR_NAN_DUAL, + NL80211_ATTR_NAN_FUNC, /* add attributes here, update the policy in nl80211.c */ @@ -4917,4 +4938,133 @@ enum nl80211_nan_dual_band_conf { NL80211_NAN_BAND_5GHZ = 1 << 2, }; +/** + * enum nl80211_nan_function_type - NAN function type + * + * Defines the function type of a NAN function + * + * @NL80211_NAN_FUNC_PUBLISH: function is publish + * @NL80211_NAN_FUNC_SUBSCRIBE: function is subscribe + * @NL80211_NAN_FUNC_FOLLOW_UP: function is follow-up + */ +enum nl80211_nan_function_type { + NL80211_NAN_FUNC_PUBLISH, + NL80211_NAN_FUNC_SUBSCRIBE, + NL80211_NAN_FUNC_FOLLOW_UP, + + /* keep last */ + __NL80211_NAN_FUNC_TYPE_AFTER_LAST, + NL80211_NAN_FUNC_MAX_TYPE = __NL80211_NAN_FUNC_TYPE_AFTER_LAST - 1, +}; + +/** + * enum nl80211_nan_publish_type - NAN publish tx type + * + * Defines how to send publish Service Discovery Frames + * + * @NL80211_NAN_SOLICITED_PUBLISH: publish function is solicited + * @NL80211_NAN_UNSOLICITED_PUBLISH: publish function is unsolicited + */ +enum nl80211_nan_publish_type { + NL80211_NAN_SOLICITED_PUBLISH = 1 << 0, + NL80211_NAN_UNSOLICITED_PUBLISH = 1 << 1, +}; + +#define NL80211_NAN_FUNC_SERVICE_ID_LEN 6 +#define NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN 0xff +#define NL80211_NAN_FUNC_SRF_MAX_LEN 0xff + +/** + * enum nl80211_nan_func_attributes - NAN function attributes + * @__NL80211_NAN_FUNC_INVALID: invalid + * @NL80211_NAN_FUNC_TYPE: &enum nl80211_nan_function_type (u8). + * @NL80211_NAN_FUNC_SERVICE_ID: 6 bytes of the service ID hash as + * specified in NAN spec. This is a binary attribute. + * @NL80211_NAN_FUNC_PUBLISH_TYPE: relevant if the function's type is + * publish. Defines the transmission type for the publish Service Discovery + * Frame, see &enum nl80211_nan_publish_type. Its type is u8. + * @NL80211_NAN_FUNC_PUBLISH_BCAST: relevant if the function is a solicited + * publish. Should the solicited publish Service Discovery Frame be sent to + * the NAN Broadcast address. This is a flag. + * @NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE: relevant if the function's type is + * subscribe. Is the subscribe active. This is a flag. + * @NL80211_NAN_FUNC_FOLLOW_UP_ID: relevant if the function's type is follow up. + * The instance ID for the follow up Service Discovery Frame. This is u8. + * @NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID: relevant if the function's type + * is follow up. This is a u8. + * The requestor instance ID for the follow up Service Discovery Frame. + * @NL80211_NAN_FUNC_FOLLOW_UP_DEST: the MAC address of the recipient of the + * follow up Service Discovery Frame. This is a binary attribute. + * @NL80211_NAN_FUNC_CLOSE_RANGE: is this function limited for devices in a + * close range. The range itself (RSSI) is defined by the device. + * This is a flag. + * @NL80211_NAN_FUNC_TTL: strictly positive number of DWs this function should + * stay active. If not present infinite TTL is assumed. This is a u32. + * @NL80211_NAN_FUNC_SERVICE_INFO: array of bytes describing the service + * specific info. This is a binary attribute. + * @NL80211_NAN_FUNC_SRF: Service Receive Filter. This is a nested attribute. + * See &enum nl80211_nan_srf_attributes. + * @NL80211_NAN_FUNC_RX_MATCH_FILTER: Receive Matching filter. This is a nested + * attribute. It is a list of binary values. + * @NL80211_NAN_FUNC_TX_MATCH_FILTER: Transmit Matching filter. This is a + * nested attribute. It is a list of binary values. + * @NL80211_NAN_FUNC_INSTANCE_ID: The instance ID of the function. + * Its type is u8 and it cannot be 0. + * @NL80211_NAN_FUNC_TERM_REASON: NAN function termination reason. + * See &enum nl80211_nan_func_term_reason. + * + * @NUM_NL80211_NAN_FUNC_ATTR: internal + * @NL80211_NAN_FUNC_ATTR_MAX: highest NAN function attribute + */ +enum nl80211_nan_func_attributes { + __NL80211_NAN_FUNC_INVALID, + NL80211_NAN_FUNC_TYPE, + NL80211_NAN_FUNC_SERVICE_ID, + NL80211_NAN_FUNC_PUBLISH_TYPE, + NL80211_NAN_FUNC_PUBLISH_BCAST, + NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE, + NL80211_NAN_FUNC_FOLLOW_UP_ID, + NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID, + NL80211_NAN_FUNC_FOLLOW_UP_DEST, + NL80211_NAN_FUNC_CLOSE_RANGE, + NL80211_NAN_FUNC_TTL, + NL80211_NAN_FUNC_SERVICE_INFO, + NL80211_NAN_FUNC_SRF, + NL80211_NAN_FUNC_RX_MATCH_FILTER, + NL80211_NAN_FUNC_TX_MATCH_FILTER, + NL80211_NAN_FUNC_INSTANCE_ID, + NL80211_NAN_FUNC_TERM_REASON, + + /* keep last */ + NUM_NL80211_NAN_FUNC_ATTR, + NL80211_NAN_FUNC_ATTR_MAX = NUM_NL80211_NAN_FUNC_ATTR - 1 +}; + +/** + * enum nl80211_nan_srf_attributes - NAN Service Response filter attributes + * @__NL80211_NAN_SRF_INVALID: invalid + * @NL80211_NAN_SRF_INCLUDE: present if the include bit of the SRF set. + * This is a flag. + * @NL80211_NAN_SRF_BF: Bloom Filter. Present if and only if + * &NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary. + * @NL80211_NAN_SRF_BF_IDX: index of the Bloom Filter. Mandatory if + * &NL80211_NAN_SRF_BF is present. This is a u8. + * @NL80211_NAN_SRF_MAC_ADDRS: list of MAC addresses for the SRF. Present if + * and only if &NL80211_NAN_SRF_BF isn't present. This is a nested + * attribute. Each nested attribute is a MAC address. + * @NUM_NL80211_NAN_SRF_ATTR: internal + * @NL80211_NAN_SRF_ATTR_MAX: highest NAN SRF attribute + */ +enum nl80211_nan_srf_attributes { + __NL80211_NAN_SRF_INVALID, + NL80211_NAN_SRF_INCLUDE, + NL80211_NAN_SRF_BF, + NL80211_NAN_SRF_BF_IDX, + NL80211_NAN_SRF_MAC_ADDRS, + + /* keep last */ + NUM_NL80211_NAN_SRF_ATTR, + NL80211_NAN_SRF_ATTR_MAX = NUM_NL80211_NAN_SRF_ATTR - 1, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/core.c b/net/wireless/core.c index 013987243c0b..8201e6d7449e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -605,7 +605,8 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && - (!rdev->ops->start_nan || !rdev->ops->stop_nan))) + (!rdev->ops->start_nan || !rdev->ops->stop_nan || + !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) return -EINVAL; /* diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9e9fb37087fc..0eca59ccd685 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, + [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -504,6 +505,39 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = { }, }; +/* policy for NAN function attributes */ +static const struct nla_policy +nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { + [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SERVICE_ID_LEN }, + [NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { .len = ETH_ALEN }, + [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, + [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN }, + [NL80211_NAN_FUNC_SRF] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_RX_MATCH_FILTER] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_TX_MATCH_FILTER] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_INSTANCE_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_TERM_REASON] = { .type = NLA_U8 }, +}; + +/* policy for Service Response Filter attributes */ +static const struct nla_policy +nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { + [NL80211_NAN_SRF_INCLUDE] = { .type = NLA_FLAG }, + [NL80211_NAN_SRF_BF] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SRF_MAX_LEN }, + [NL80211_NAN_SRF_BF_IDX] = { .type = NLA_U8 }, + [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -10566,6 +10600,325 @@ static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info) return 0; } +static int validate_nan_filter(struct nlattr *filter_attr) +{ + struct nlattr *attr; + int len = 0, n_entries = 0, rem; + + nla_for_each_nested(attr, filter_attr, rem) { + len += nla_len(attr); + n_entries++; + } + + if (len >= U8_MAX) + return -EINVAL; + + return n_entries; +} + +static int handle_nan_filter(struct nlattr *attr_filter, + struct cfg80211_nan_func *func, + bool tx) +{ + struct nlattr *attr; + int n_entries, rem, i; + struct cfg80211_nan_func_filter *filter; + + n_entries = validate_nan_filter(attr_filter); + if (n_entries < 0) + return n_entries; + + BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters)); + + filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + filter[i].filter = kmemdup(nla_data(attr), nla_len(attr), + GFP_KERNEL); + filter[i].len = nla_len(attr); + i++; + } + if (tx) { + func->num_tx_filters = n_entries; + func->tx_filters = filter; + } else { + func->num_rx_filters = n_entries; + func->rx_filters = filter; + } + + return 0; +} + +static int nl80211_nan_add_func(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr; + struct cfg80211_nan_func *func; + struct sk_buff *msg = NULL; + void *hdr = NULL; + int err = 0; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (!info->attrs[NL80211_ATTR_NAN_FUNC]) + return -EINVAL; + + if (wdev->owner_nlportid && + wdev->owner_nlportid != info->snd_portid) + return -ENOTCONN; + + err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX, + nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]), + nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]), + nl80211_nan_func_policy); + if (err) + return err; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return -ENOMEM; + + func->cookie = wdev->wiphy->cookie_counter++; + + if (!tb[NL80211_NAN_FUNC_TYPE] || + nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) { + err = -EINVAL; + goto out; + } + + + func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]); + + if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) { + err = -EINVAL; + goto out; + } + + memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]), + sizeof(func->service_id)); + + func->close_range = + nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]); + + if (tb[NL80211_NAN_FUNC_SERVICE_INFO]) { + func->serv_spec_info_len = + nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]); + func->serv_spec_info = + kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]), + func->serv_spec_info_len, + GFP_KERNEL); + if (!func->serv_spec_info) { + err = -ENOMEM; + goto out; + } + } + + if (tb[NL80211_NAN_FUNC_TTL]) + func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]); + + switch (func->type) { + case NL80211_NAN_FUNC_PUBLISH: + if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) { + err = -EINVAL; + goto out; + } + + func->publish_type = + nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]); + func->publish_bcast = + nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]); + + if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) && + func->publish_bcast) { + err = -EINVAL; + goto out; + } + break; + case NL80211_NAN_FUNC_SUBSCRIBE: + func->subscribe_active = + nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]); + break; + case NL80211_NAN_FUNC_FOLLOW_UP: + if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] || + !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) { + err = -EINVAL; + goto out; + } + + func->followup_id = + nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]); + func->followup_reqid = + nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]); + memcpy(func->followup_dest.addr, + nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]), + sizeof(func->followup_dest.addr)); + if (func->ttl) { + err = -EINVAL; + goto out; + } + break; + default: + err = -EINVAL; + goto out; + } + + if (tb[NL80211_NAN_FUNC_SRF]) { + struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR]; + + err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX, + nla_data(tb[NL80211_NAN_FUNC_SRF]), + nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL); + if (err) + goto out; + + func->srf_include = + nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]); + + if (srf_tb[NL80211_NAN_SRF_BF]) { + if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] || + !srf_tb[NL80211_NAN_SRF_BF_IDX]) { + err = -EINVAL; + goto out; + } + + func->srf_bf_len = + nla_len(srf_tb[NL80211_NAN_SRF_BF]); + func->srf_bf = + kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]), + func->srf_bf_len, GFP_KERNEL); + if (!func->srf_bf) { + err = -ENOMEM; + goto out; + } + + func->srf_bf_idx = + nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]); + } else { + struct nlattr *attr, *mac_attr = + srf_tb[NL80211_NAN_SRF_MAC_ADDRS]; + int n_entries, rem, i = 0; + + if (!mac_attr) { + err = -EINVAL; + goto out; + } + + n_entries = validate_acl_mac_addrs(mac_attr); + if (n_entries <= 0) { + err = -EINVAL; + goto out; + } + + func->srf_num_macs = n_entries; + func->srf_macs = + kzalloc(sizeof(*func->srf_macs) * n_entries, + GFP_KERNEL); + if (!func->srf_macs) { + err = -ENOMEM; + goto out; + } + + nla_for_each_nested(attr, mac_attr, rem) + memcpy(func->srf_macs[i++].addr, nla_data(attr), + sizeof(*func->srf_macs)); + } + } + + if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) { + err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER], + func, true); + if (err) + goto out; + } + + if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) { + err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER], + func, false); + if (err) + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_ADD_NAN_FUNCTION); + /* This can't really happen - we just allocated 4KB */ + if (WARN_ON(!hdr)) { + err = -ENOMEM; + goto out; + } + + err = rdev_add_nan_func(rdev, wdev, func); +out: + if (err < 0) { + cfg80211_free_nan_func(func); + nlmsg_free(msg); + return err; + } + + /* propagate the instance id and cookie to userspace */ + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie, + NL80211_ATTR_PAD)) + goto nla_put_failure; + + func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + if (!func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, + func->instance_id)) + goto nla_put_failure; + + nla_nest_end(msg, func_attr); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +static int nl80211_nan_del_func(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + u64 cookie; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + if (wdev->owner_nlportid && + wdev->owner_nlportid != info->snd_portid) + return -ENOTCONN; + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + rdev_del_nan_func(rdev, wdev, cookie); + + return 0; +} + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { @@ -11923,6 +12276,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_NAN_FUNCTION, + .doit = nl80211_nan_add_func, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_NAN_FUNCTION, + .doit = nl80211_nan_del_func, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index afb68a8428b9..98c4c3bdcb11 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -907,6 +907,27 @@ static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int +rdev_add_nan_func(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_func *nan_func) +{ + int ret; + + trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); + ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, u64 cookie) +{ + trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); + rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 5f3370f4c6a2..56089843d619 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1916,6 +1916,45 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, TP_ARGS(wiphy, wdev) ); +TRACE_EVENT(rdev_add_nan_func, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + const struct cfg80211_nan_func *func), + TP_ARGS(wiphy, wdev, func), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, func_type) + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->func_type = func->type; + __entry->cookie = func->cookie + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type, + __entry->cookie) +); + +TRACE_EVENT(rdev_del_nan_func, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + u64 cookie), + TP_ARGS(wiphy, wdev, cookie), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->cookie = cookie; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie) +); + TRACE_EVENT(rdev_set_mac_acl, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_acl_data *params), diff --git a/net/wireless/util.c b/net/wireless/util.c index 7a2d46b0058a..8edce22d1b93 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1762,6 +1762,28 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, } EXPORT_SYMBOL(cfg80211_get_station); +void cfg80211_free_nan_func(struct cfg80211_nan_func *f) +{ + int i; + + if (!f) + return; + + kfree(f->serv_spec_info); + kfree(f->srf_bf); + kfree(f->srf_macs); + for (i = 0; i < f->num_rx_filters; i++) + kfree(f->rx_filters[i].filter); + + for (i = 0; i < f->num_tx_filters; i++) + kfree(f->tx_filters[i].filter); + + kfree(f->rx_filters); + kfree(f->tx_filters); + kfree(f); +} +EXPORT_SYMBOL(cfg80211_free_nan_func); + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = -- cgit v1.2.3 From a5a9dcf291e1e541243878eed2d73a74006fa1f1 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:16 +0300 Subject: cfg80211: allow the user space to change current NAN configuration Some NAN configuration paramaters may change during the operation of the NAN device. For example, a user may want to update master preference value when the device gets plugged/unplugged to the power. Add API that allows to do so. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 19 +++++++++++++++++++ include/uapi/linux/nl80211.h | 11 +++++++++-- net/wireless/nl80211.c | 42 ++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 17 +++++++++++++++++ net/wireless/trace.h | 24 ++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2f35ccf6da83..8574a57e19ba 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2326,6 +2326,18 @@ struct cfg80211_nan_conf { u8 dual; }; +/** + * enum cfg80211_nan_conf_changes - indicates changed fields in NAN + * configuration + * + * @CFG80211_NAN_CONF_CHANGED_PREF: master preference + * @CFG80211_NAN_CONF_CHANGED_DUAL: dual band operation + */ +enum cfg80211_nan_conf_changes { + CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), + CFG80211_NAN_CONF_CHANGED_DUAL = BIT(1), +}; + /** * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter * @@ -2691,6 +2703,9 @@ struct cfg80211_nan_func { * On success the driver should assign an instance_id in the * provided @nan_func. * @del_nan_func: Delete a NAN function. + * @nan_change_conf: changes NAN configuration. The changed parameters must + * be specified in @changes (using &enum cfg80211_nan_conf_changes); + * All other parameters must be ignored. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2963,6 +2978,10 @@ struct cfg80211_ops { struct cfg80211_nan_func *nan_func); void (*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie); + int (*nan_change_conf)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, + u32 changes); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e4935d963061..9c9c0c352873 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -862,6 +862,10 @@ * the response to this command. * Look at %NL80211_ATTR_SOCKET_OWNER as well. * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie. + * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN + * must be operational (%NL80211_CMD_START_NAN was executed). + * It must contain at least one of the following attributes: + * %NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1055,6 +1059,7 @@ enum nl80211_commands { NL80211_CMD_STOP_NAN, NL80211_CMD_ADD_NAN_FUNCTION, NL80211_CMD_DEL_NAN_FUNCTION, + NL80211_CMD_CHANGE_NAN_CONFIG, /* add new commands above here */ @@ -1910,12 +1915,14 @@ enum nl80211_commands { * used to pull the stored data for mesh peer in power save state. * * @NL80211_ATTR_NAN_MASTER_PREF: the master preference to be used by - * %NL80211_CMD_START_NAN. Its type is u8 and it can't be 0. + * %NL80211_CMD_START_NAN and optionally with + * %NL80211_CMD_CHANGE_NAN_CONFIG. Its type is u8 and it can't be 0. * Also, values 1 and 255 are reserved for certification purposes and * should not be used during a normal device operation. * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see * &enum nl80211_nan_dual_band_conf). This attribute is used with - * %NL80211_CMD_START_NAN. + * %NL80211_CMD_START_NAN and optionally with + * %NL80211_CMD_CHANGE_NAN_CONFIG. * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See * &enum nl80211_nan_func_attributes for description of this nested * attribute. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0eca59ccd685..c0b5ae4af2d8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10919,6 +10919,40 @@ static int nl80211_nan_del_func(struct sk_buff *skb, return 0; } +static int nl80211_nan_change_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct cfg80211_nan_conf conf = {}; + u32 changed = 0; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf.master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + if (conf.master_pref <= 1 || conf.master_pref == 255) + return -EINVAL; + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_NAN_DUAL]) { + conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + changed |= CFG80211_NAN_CONF_CHANGED_DUAL; + } + + if (!changed) + return -EINVAL; + + return rdev_nan_change_conf(rdev, wdev, &conf, changed); +} + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { @@ -12292,6 +12326,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, + .doit = nl80211_nan_change_config, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 98c4c3bdcb11..11cf83c8ad4f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -928,6 +928,23 @@ static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int +rdev_nan_change_conf(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, u32 changes) +{ + int ret; + + trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); + if (rdev->ops->nan_change_conf) + ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, + changes); + else + ret = -ENOTSUPP; + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 56089843d619..a3d0a91b1e09 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1911,6 +1911,30 @@ TRACE_EVENT(rdev_start_nan, __entry->dual) ); +TRACE_EVENT(rdev_nan_change_conf, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, u32 changes), + TP_ARGS(wiphy, wdev, conf, changes), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, master_pref) + __field(u8, dual); + __field(u32, changes); + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + __entry->changes = changes; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", master preference: %u, dual: %d, changes: %x", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, + __entry->dual, __entry->changes) +); + DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) -- cgit v1.2.3 From 50bcd31d9992e99c231820f5276e70346cbfbc51 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:17 +0300 Subject: cfg80211: provide a function to report a match for NAN Provide a function the driver can call to report a match. This will send the event to the user space. If the NAN instance is tied to the owner, the notifications will be sent to the socket that started the NAN interface only. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 37 ++++++++++++++++++++ include/uapi/linux/nl80211.h | 31 +++++++++++++++++ net/wireless/nl80211.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8574a57e19ba..5481664b5389 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5694,6 +5694,43 @@ wiphy_ext_feature_isset(struct wiphy *wiphy, */ void cfg80211_free_nan_func(struct cfg80211_nan_func *f); +/** + * struct cfg80211_nan_match_params - NAN match parameters + * @type: the type of the function that triggered a match. If it is + * %NL80211_NAN_FUNC_SUBSCRIBE it means that we replied to a subscriber. + * If it is %NL80211_NAN_FUNC_PUBLISH, it means that we got a discovery + * result. + * If it is %NL80211_NAN_FUNC_FOLLOW_UP, we received a follow up. + * @inst_id: the local instance id + * @peer_inst_id: the instance id of the peer's function + * @addr: the MAC address of the peer + * @info_len: the length of the &info + * @info: the Service Specific Info from the peer (if any) + * @cookie: unique identifier of the corresponding function + */ +struct cfg80211_nan_match_params { + enum nl80211_nan_function_type type; + u8 inst_id; + u8 peer_inst_id; + const u8 *addr; + u8 info_len; + const u8 *info; + u64 cookie; +}; + +/** + * cfg80211_nan_match - report a match for a NAN function. + * @wdev: the wireless device reporting the match + * @match: match notification parameters + * @gfp: allocation flags + * + * This function reports that the a NAN function had a match. This + * can be a subscribe that had a match or a solicited publish that + * was sent. It can also be a follow up that was received. + */ +void cfg80211_nan_match(struct wireless_dev *wdev, + struct cfg80211_nan_match_params *match, gfp_t gfp); + /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9c9c0c352873..995bf802d604 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -48,6 +48,7 @@ #define NL80211_MULTICAST_GROUP_REG "regulatory" #define NL80211_MULTICAST_GROUP_MLME "mlme" #define NL80211_MULTICAST_GROUP_VENDOR "vendor" +#define NL80211_MULTICAST_GROUP_NAN "nan" #define NL80211_MULTICAST_GROUP_TESTMODE "testmode" /** @@ -866,6 +867,9 @@ * must be operational (%NL80211_CMD_START_NAN was executed). * It must contain at least one of the following attributes: * %NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL. + * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported. + * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and + * %NL80211_ATTR_COOKIE. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1060,6 +1064,7 @@ enum nl80211_commands { NL80211_CMD_ADD_NAN_FUNCTION, NL80211_CMD_DEL_NAN_FUNCTION, NL80211_CMD_CHANGE_NAN_CONFIG, + NL80211_CMD_NAN_MATCH, /* add new commands above here */ @@ -1926,6 +1931,8 @@ enum nl80211_commands { * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See * &enum nl80211_nan_func_attributes for description of this nested * attribute. + * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute. + * See &enum nl80211_nan_match_attributes. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2324,6 +2331,7 @@ enum nl80211_attrs { NL80211_ATTR_NAN_MASTER_PREF, NL80211_ATTR_NAN_DUAL, NL80211_ATTR_NAN_FUNC, + NL80211_ATTR_NAN_MATCH, /* add attributes here, update the policy in nl80211.c */ @@ -5074,4 +5082,27 @@ enum nl80211_nan_srf_attributes { NL80211_NAN_SRF_ATTR_MAX = NUM_NL80211_NAN_SRF_ATTR - 1, }; +/** + * enum nl80211_nan_match_attributes - NAN match attributes + * @__NL80211_NAN_MATCH_INVALID: invalid + * @NL80211_NAN_MATCH_FUNC_LOCAL: the local function that had the + * match. This is a nested attribute. + * See &enum nl80211_nan_func_attributes. + * @NL80211_NAN_MATCH_FUNC_PEER: the peer function + * that caused the match. This is a nested attribute. + * See &enum nl80211_nan_func_attributes. + * + * @NUM_NL80211_NAN_MATCH_ATTR: internal + * @NL80211_NAN_MATCH_ATTR_MAX: highest NAN match attribute + */ +enum nl80211_nan_match_attributes { + __NL80211_NAN_MATCH_INVALID, + NL80211_NAN_MATCH_FUNC_LOCAL, + NL80211_NAN_MATCH_FUNC_PEER, + + /* keep last */ + NUM_NL80211_NAN_MATCH_ATTR, + NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c0b5ae4af2d8..0bbd9ed28318 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -56,6 +56,7 @@ enum nl80211_multicast_groups { NL80211_MCGRP_REGULATORY, NL80211_MCGRP_MLME, NL80211_MCGRP_VENDOR, + NL80211_MCGRP_NAN, NL80211_MCGRP_TESTMODE /* keep last - ifdef! */ }; @@ -65,6 +66,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = { [NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG }, [NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME }, [NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR }, + [NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN }, #ifdef CONFIG_NL80211_TESTMODE [NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE } #endif @@ -10953,6 +10955,84 @@ static int nl80211_nan_change_config(struct sk_buff *skb, return rdev_nan_change_conf(rdev, wdev, &conf, changed); } +void cfg80211_nan_match(struct wireless_dev *wdev, + struct cfg80211_nan_match_params *match, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct nlattr *match_attr, *local_func_attr, *peer_func_attr; + struct sk_buff *msg; + void *hdr; + + if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie, + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr)) + goto nla_put_failure; + + match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH); + if (!match_attr) + goto nla_put_failure; + + local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL); + if (!local_func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id)) + goto nla_put_failure; + + nla_nest_end(msg, local_func_attr); + + peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER); + if (!peer_func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_TYPE, match->type) || + nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id)) + goto nla_put_failure; + + if (match->info && match->info_len && + nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len, + match->info)) + goto nla_put_failure; + + nla_nest_end(msg, peer_func_attr); + nla_nest_end(msg, match_attr); + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->owner_nlportid); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_match); + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From 368e5a7b4ecb71b3d347799cb9351b0dce5dec70 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:18 +0300 Subject: cfg80211: Provide an API to report NAN function termination Provide a function that reports NAN DE function termination. The function may be terminated due to one of the following reasons: user request, ttl expiration or failure. If the NAN instance is tied to the owner, the notification will be sent to the socket that started the NAN interface only Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 16 ++++++++++++ include/uapi/linux/nl80211.h | 18 +++++++++++++ net/wireless/nl80211.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5481664b5389..fe78f02a242e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5731,6 +5731,22 @@ struct cfg80211_nan_match_params { void cfg80211_nan_match(struct wireless_dev *wdev, struct cfg80211_nan_match_params *match, gfp_t gfp); +/** + * cfg80211_nan_func_terminated - notify about NAN function termination. + * + * @wdev: the wireless device reporting the match + * @inst_id: the local instance id + * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*) + * @cookie: unique NAN function identifier + * @gfp: allocation flags + * + * This function reports that the a NAN function is terminated. + */ +void cfg80211_nan_func_terminated(struct wireless_dev *wdev, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + u64 cookie, gfp_t gfp); + /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 995bf802d604..56368e9b4622 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -863,6 +863,9 @@ * the response to this command. * Look at %NL80211_ATTR_SOCKET_OWNER as well. * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie. + * This command is also used as a notification sent when a NAN function is + * terminated. This will contain a %NL80211_ATTR_NAN_FUNC_INST_ID + * and %NL80211_ATTR_COOKIE attributes. * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN * must be operational (%NL80211_CMD_START_NAN was executed). * It must contain at least one of the following attributes: @@ -4985,6 +4988,21 @@ enum nl80211_nan_publish_type { NL80211_NAN_UNSOLICITED_PUBLISH = 1 << 1, }; +/** + * enum nl80211_nan_func_term_reason - NAN functions termination reason + * + * Defines termination reasons of a NAN function + * + * @NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST: requested by user + * @NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED: timeout + * @NL80211_NAN_FUNC_TERM_REASON_ERROR: errored + */ +enum nl80211_nan_func_term_reason { + NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST, + NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED, + NL80211_NAN_FUNC_TERM_REASON_ERROR, +}; + #define NL80211_NAN_FUNC_SERVICE_ID_LEN 6 #define NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN 0xff #define NL80211_NAN_FUNC_SRF_MAX_LEN 0xff diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0bbd9ed28318..92eb6f0b9f3d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11033,6 +11033,66 @@ nla_put_failure: } EXPORT_SYMBOL(cfg80211_nan_match); +void cfg80211_nan_func_terminated(struct wireless_dev *wdev, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + u64 cookie, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + struct nlattr *func_attr; + void *hdr; + + if (WARN_ON(!inst_id)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) + goto nla_put_failure; + + func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + if (!func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id) || + nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason)) + goto nla_put_failure; + + nla_nest_end(msg, func_attr); + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->owner_nlportid); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_func_terminated); + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From 5953ff6d6a3e92dd4f8d9d8e8a9359d7e180ae93 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:19 +0300 Subject: mac80211: implement nan_change_conf Implement nan_change_conf callback which allows to change current NAN configuration (master preference and dual band operation). Store the current NAN configuration in sdata, so it can be used both to provide the driver the updated configuration with changes and also it will be used in hw reconfig flows in next patches. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 9 +++++++++ net/mac80211/cfg.c | 31 +++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 21 +++++++++++++++++++++ net/mac80211/ieee80211_i.h | 10 ++++++++++ net/mac80211/trace.h | 31 +++++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index df9b5cff300c..ef8d02a2ce1a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3423,6 +3423,12 @@ enum ieee80211_reconfig_type { * * @start_nan: join an existing NAN cluster, or create a new one. * @stop_nan: leave the NAN cluster. + * @nan_change_conf: change NAN configuration. The data in cfg80211_nan_conf + * contains full new configuration and changes specify which parameters + * are changed with respect to the last NAN config. + * The driver gets both full configuration and the changed parameters since + * some devices may need the full configuration while others need only the + * changed parameters. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -3664,6 +3670,9 @@ struct ieee80211_ops { struct cfg80211_nan_conf *conf); int (*stop_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); + int (*nan_change_conf)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct cfg80211_nan_conf *conf, u32 changes); }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9aabb0932d24..38fdb539cab3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -186,6 +186,36 @@ static void ieee80211_stop_nan(struct wiphy *wiphy, ieee80211_sdata_stop(sdata); } +static int ieee80211_nan_change_conf(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, + u32 changes) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct cfg80211_nan_conf new_conf; + int ret = 0; + + if (sdata->vif.type != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!ieee80211_sdata_running(sdata)) + return -ENETDOWN; + + new_conf = sdata->u.nan.conf; + + if (changes & CFG80211_NAN_CONF_CHANGED_PREF) + new_conf.master_pref = conf->master_pref; + + if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) + new_conf.dual = conf->dual; + + ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); + if (!ret) + sdata->u.nan.conf = new_conf; + + return ret; +} + static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) @@ -3500,4 +3530,5 @@ const struct cfg80211_ops mac80211_config_ops = { .del_tx_ts = ieee80211_del_tx_ts, .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, + .nan_change_conf = ieee80211_nan_change_conf, }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index e52cfb855bd9..daaa409bec6f 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1192,4 +1192,25 @@ static inline void drv_stop_nan(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline int drv_nan_change_conf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf, + u32 changes) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + if (!local->ops->nan_change_conf) + return -EOPNOTSUPP; + + trace_drv_nan_change_conf(local, sdata, conf, changes); + ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, + changes); + trace_drv_return_int(local, ret); + + return ret; +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c71c73594790..712b20b05660 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -830,6 +830,15 @@ struct ieee80211_if_mntr { u8 mu_follow_addr[ETH_ALEN] __aligned(2); }; +/** + * struct ieee80211_if_nan - NAN state + * + * @conf: current NAN configuration + */ +struct ieee80211_if_nan { + struct cfg80211_nan_conf conf; +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -929,6 +938,7 @@ struct ieee80211_sub_if_data { struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; struct ieee80211_if_mntr mntr; + struct ieee80211_if_nan nan; } u; #ifdef CONFIG_MAC80211_DEBUGFS diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index deefbfb9f6fb..0bafe1159d01 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1750,6 +1750,37 @@ TRACE_EVENT(drv_stop_nan, ) ); +TRACE_EVENT(drv_nan_change_conf, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf, + u32 changes), + + TP_ARGS(local, sdata, conf, changes), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, master_pref) + __field(u8, dual) + __field(u32, changes) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + __entry->changes = changes; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", master preference: %u, dual: %d, changes: 0x%x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, + __entry->dual, __entry->changes + ) +); + /* * Tracing for API calls that drivers call. */ -- cgit v1.2.3 From 167e33f4f68cc8e4e3bdaf6d43641176c51f2d79 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:20 +0300 Subject: mac80211: Implement add_nan_func and rm_nan_func Implement add/rm_nan_func functions and handle NAN function termination notifications. Handle instance_id allocation for NAN functions and implement the reconfig flow. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 31 ++++++++++++ net/mac80211/cfg.c | 114 +++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 32 +++++++++++++ net/mac80211/ieee80211_i.h | 7 +++ net/mac80211/iface.c | 20 +++++++- net/mac80211/main.c | 3 ++ net/mac80211/trace.h | 52 +++++++++++++++++++++ net/mac80211/util.c | 48 ++++++++++++++++++- 8 files changed, 304 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ef8d02a2ce1a..d4ddf476dc76 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2177,6 +2177,8 @@ enum ieee80211_hw_flags { * @n_cipher_schemes: a size of an array of cipher schemes definitions. * @cipher_schemes: a pointer to an array of cipher scheme definitions * supported by HW. + * @max_nan_de_entries: maximum number of NAN DE functions supported by the + * device. */ struct ieee80211_hw { struct ieee80211_conf conf; @@ -2211,6 +2213,7 @@ struct ieee80211_hw { u8 uapsd_max_sp_len; u8 n_cipher_schemes; const struct ieee80211_cipher_scheme *cipher_schemes; + u8 max_nan_de_entries; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, @@ -3429,6 +3432,12 @@ enum ieee80211_reconfig_type { * The driver gets both full configuration and the changed parameters since * some devices may need the full configuration while others need only the * changed parameters. + * @add_nan_func: Add a NAN function. Returns 0 on success. The data in + * cfg80211_nan_func must not be referenced outside the scope of + * this call. + * @del_nan_func: Remove a NAN function. The driver must call + * ieee80211_nan_func_terminated() with + * NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -3673,6 +3682,12 @@ struct ieee80211_ops { int (*nan_change_conf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf, u32 changes); + int (*add_nan_func)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + const struct cfg80211_nan_func *nan_func); + void (*del_nan_func)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u8 instance_id); }; /** @@ -5746,4 +5761,20 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt); + +/** + * ieee80211_nan_func_terminated - notify about NAN function termination. + * + * This function is used to notify mac80211 about NAN function termination. + * Note that this function can't be called from hard irq. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @inst_id: the local instance id + * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*) + * @gfp: allocation flags + */ +void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + gfp_t gfp); #endif /* MAC80211_H */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 38fdb539cab3..72ddb4379319 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -174,6 +174,8 @@ static int ieee80211_start_nan(struct wiphy *wiphy, if (ret) ieee80211_sdata_stop(sdata); + sdata->u.nan.conf = *conf; + return ret; } @@ -216,6 +218,84 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, return ret; } +static int ieee80211_add_nan_func(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_func *nan_func) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + int ret; + + if (sdata->vif.type != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!ieee80211_sdata_running(sdata)) + return -ENETDOWN; + + spin_lock_bh(&sdata->u.nan.func_lock); + + ret = idr_alloc(&sdata->u.nan.function_inst_ids, + nan_func, 1, sdata->local->hw.max_nan_de_entries + 1, + GFP_ATOMIC); + spin_unlock_bh(&sdata->u.nan.func_lock); + + if (ret < 0) + return ret; + + nan_func->instance_id = ret; + + WARN_ON(nan_func->instance_id == 0); + + ret = drv_add_nan_func(sdata->local, sdata, nan_func); + if (ret) { + spin_lock_bh(&sdata->u.nan.func_lock); + idr_remove(&sdata->u.nan.function_inst_ids, + nan_func->instance_id); + spin_unlock_bh(&sdata->u.nan.func_lock); + } + + return ret; +} + +static struct cfg80211_nan_func * +ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata, + u64 cookie) +{ + struct cfg80211_nan_func *func; + int id; + + lockdep_assert_held(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) { + if (func->cookie == cookie) + return func; + } + + return NULL; +} + +static void ieee80211_del_nan_func(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct cfg80211_nan_func *func; + u8 instance_id = 0; + + if (sdata->vif.type != NL80211_IFTYPE_NAN || + !ieee80211_sdata_running(sdata)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = ieee80211_find_nan_func_by_cookie(sdata, cookie); + if (func) + instance_id = func->instance_id; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + if (instance_id) + drv_del_nan_func(sdata->local, sdata, instance_id); +} + static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) @@ -3443,6 +3523,38 @@ static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, return -ENOENT; } +void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct cfg80211_nan_func *func; + u64 cookie; + + if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); + if (WARN_ON(!func)) { + spin_unlock_bh(&sdata->u.nan.func_lock); + return; + } + + cookie = func->cookie; + idr_remove(&sdata->u.nan.function_inst_ids, inst_id); + + spin_unlock_bh(&sdata->u.nan.func_lock); + + cfg80211_free_nan_func(func); + + cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, + reason, cookie, gfp); +} +EXPORT_SYMBOL(ieee80211_nan_func_terminated); + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3531,4 +3643,6 @@ const struct cfg80211_ops mac80211_config_ops = { .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, .nan_change_conf = ieee80211_nan_change_conf, + .add_nan_func = ieee80211_add_nan_func, + .del_nan_func = ieee80211_del_nan_func, }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index daaa409bec6f..dea92c33b2ca 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1213,4 +1213,36 @@ static inline int drv_nan_change_conf(struct ieee80211_local *local, return ret; } +static inline int drv_add_nan_func(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_nan_func *nan_func) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + if (!local->ops->add_nan_func) + return -EOPNOTSUPP; + + trace_drv_add_nan_func(local, sdata, nan_func); + ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_del_nan_func(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u8 instance_id) +{ + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_del_nan_func(local, sdata, instance_id); + if (local->ops->del_nan_func) + local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 712b20b05660..2b391f242e58 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,8 @@ struct ieee80211_local; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) +#define IEEE80211_MAX_NAN_INSTANCE_ID 255 + struct ieee80211_fragment_entry { struct sk_buff_head skb_list; unsigned long first_frag_time; @@ -834,9 +836,14 @@ struct ieee80211_if_mntr { * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration + * @func_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; + + /* protects function_inst_ids */ + spinlock_t func_lock; + struct idr function_inst_ids; }; struct ieee80211_sub_if_data { diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 507f46a8eb1c..638ec0759078 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -798,6 +798,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, struct ps_data *ps; struct cfg80211_chan_def chandef; bool cancel_scan; + struct cfg80211_nan_func *func; clear_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -950,11 +951,22 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_adjust_monitor_flags(sdata, -1); break; + case NL80211_IFTYPE_NAN: + /* clean all the functions */ + spin_lock_bh(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) { + idr_remove(&sdata->u.nan.function_inst_ids, i); + cfg80211_free_nan_func(func); + } + idr_destroy(&sdata->u.nan.function_inst_ids); + + spin_unlock_bh(&sdata->u.nan.func_lock); + break; case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); /* fall through */ - case NL80211_IFTYPE_NAN: default: cancel_work_sync(&sdata->work); /* @@ -1462,9 +1474,13 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_WDS: sdata->vif.bss_conf.bssid = NULL; break; + case NL80211_IFTYPE_NAN: + idr_init(&sdata->u.nan.function_inst_ids); + spin_lock_init(&sdata->u.nan.func_lock); + sdata->vif.bss_conf.bssid = sdata->vif.addr; + break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_NAN: sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index b5cf2c5cc166..1075ac24c8c5 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1063,6 +1063,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; + if (!local->hw.max_nan_de_entries) + local->hw.max_nan_de_entries = IEEE80211_MAX_NAN_INSTANCE_ID; + result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 0bafe1159d01..37891fa67e9a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1781,6 +1781,58 @@ TRACE_EVENT(drv_nan_change_conf, ) ); +TRACE_EVENT(drv_add_nan_func, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_nan_func *func), + + TP_ARGS(local, sdata, func), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, type) + __field(u8, inst_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->type = func->type; + __entry->inst_id = func->instance_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", type: %u, inst_id: %u", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id + ) +); + +TRACE_EVENT(drv_del_nan_func, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u8 instance_id), + + TP_ARGS(local, sdata, instance_id), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, instance_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->instance_id = instance_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", instance_id: %u", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id + ) +); + /* * Tracing for API calls that drivers call. */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5b57fcaaec9b..91754c8dafb2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1749,6 +1749,46 @@ static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->sta_mtx); } +static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) +{ + struct cfg80211_nan_func *func, **funcs; + int res, id, i = 0; + + res = drv_start_nan(sdata->local, sdata, + &sdata->u.nan.conf); + if (WARN_ON(res)) + return res; + + funcs = kzalloc((sdata->local->hw.max_nan_de_entries + 1) * + sizeof(*funcs), GFP_KERNEL); + if (!funcs) + return -ENOMEM; + + /* Add all the functions: + * This is a little bit ugly. We need to call a potentially sleeping + * callback for each NAN function, so we can't hold the spinlock. + */ + spin_lock_bh(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) + funcs[i++] = func; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + for (i = 0; funcs[i]; i++) { + res = drv_add_nan_func(sdata->local, sdata, funcs[i]); + if (WARN_ON(res)) + ieee80211_nan_func_terminated(&sdata->vif, + funcs[i]->instance_id, + NL80211_NAN_FUNC_TERM_REASON_ERROR, + GFP_KERNEL); + } + + kfree(funcs); + + return 0; +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -1972,11 +2012,17 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_bss_info_change_notify(sdata, changed); } break; + case NL80211_IFTYPE_NAN: + res = ieee80211_reconfig_nan(sdata); + if (res < 0) { + ieee80211_handle_reconfig_failure(local); + return res; + } + break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: - case NL80211_IFTYPE_NAN: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: -- cgit v1.2.3 From 92bc43bce2849c814cece258694f167d28524fbd Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 20 Sep 2016 17:31:21 +0300 Subject: mac80211: Add API to report NAN function match Provide an API to report NAN function match. Mac80211 will lookup the corresponding cookie and report the match to cfg80211. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++++++++++ net/mac80211/cfg.c | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d4ddf476dc76..fc589ba90a48 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5777,4 +5777,20 @@ void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp); + +/** + * ieee80211_nan_func_match - notify about NAN function match event. + * + * This function is used to notify mac80211 about NAN function match. The + * cookie inside the match struct will be assigned by mac80211. + * Note that this function can't be called from hard irq. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @match: match event information + * @gfp: allocation flags + */ +void ieee80211_nan_func_match(struct ieee80211_vif *vif, + struct cfg80211_nan_match_params *match, + gfp_t gfp); + #endif /* MAC80211_H */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 72ddb4379319..fd6541f3ade3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3555,6 +3555,31 @@ void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, } EXPORT_SYMBOL(ieee80211_nan_func_terminated); +void ieee80211_nan_func_match(struct ieee80211_vif *vif, + struct cfg80211_nan_match_params *match, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct cfg80211_nan_func *func; + + if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); + if (WARN_ON(!func)) { + spin_unlock_bh(&sdata->u.nan.func_lock); + return; + } + match->cookie = func->cookie; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); +} +EXPORT_SYMBOL(ieee80211_nan_func_match); + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, -- cgit v1.2.3 From 097b065b5cbfa3fd57b47f3c86d6baa96c30bf31 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 23 Sep 2016 21:59:09 +0200 Subject: fq.h: Port memory limit mechanism from fq_codel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reusable fairness queueing implementation (fq.h) lacks the memory usage limit that the fq_codel qdisc has. This means that small devices (e.g. WiFi routers) can run out of memory when flooded with a large number of packets. This ports the memory limit feature from fq_codel to fq.h. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq.h | 3 +++ include/net/fq_impl.h | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/fq.h b/include/net/fq.h index 268b49049c37..6d8521a30c5c 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -72,9 +72,12 @@ struct fq { u32 flows_cnt; u32 perturbation; u32 limit; + u32 memory_limit; + u32 memory_usage; u32 quantum; u32 backlog; u32 overlimit; + u32 overmemory; u32 collisions; }; diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 163f3ed0f05a..4e6131cd3f43 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -29,6 +29,7 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, tin->backlog_packets--; flow->backlog -= skb->len; fq->backlog--; + fq->memory_usage -= skb->truesize; if (flow->backlog == 0) { list_del_init(&flow->backlogchain); @@ -154,6 +155,7 @@ static void fq_tin_enqueue(struct fq *fq, flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; + fq->memory_usage += skb->truesize; fq->backlog++; fq_recalc_backlog(fq, tin, flow); @@ -166,7 +168,7 @@ static void fq_tin_enqueue(struct fq *fq, __skb_queue_tail(&flow->queue, skb); - if (fq->backlog > fq->limit) { + if (fq->backlog > fq->limit || fq->memory_usage > fq->memory_limit) { flow = list_first_entry_or_null(&fq->backlogs, struct fq_flow, backlogchain); @@ -181,6 +183,8 @@ static void fq_tin_enqueue(struct fq *fq, flow->tin->overlimit++; fq->overlimit++; + if (fq->memory_usage > fq->memory_limit) + fq->overmemory++; } } @@ -251,6 +255,7 @@ static int fq_init(struct fq *fq, int flows_cnt) fq->perturbation = prandom_u32(); fq->quantum = 300; fq->limit = 8192; + fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) -- cgit v1.2.3 From 354d381baf1126c45d03b5c0d87d22caf938b86b Mon Sep 17 00:00:00 2001 From: "Pedersen, Thomas" Date: Wed, 28 Sep 2016 16:56:28 -0700 Subject: mac80211: add offset_tsf driver op and use it for mesh This allows the mesh sync (and debugfs) code to make incremental TSF adjustments, avoiding any uncertainty introduced by delay in programming absolute TSF. Signed-off-by: Thomas Pedersen Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++++++ net/mac80211/debugfs_netdev.c | 12 +++++++++--- net/mac80211/driver-ops.c | 15 +++++++++++++++ net/mac80211/driver-ops.h | 3 +++ net/mac80211/mesh_sync.c | 10 +++++++--- net/mac80211/trace.h | 26 ++++++++++++++++++++++++++ 6 files changed, 68 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fc589ba90a48..c9f39538ac17 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3169,6 +3169,12 @@ enum ieee80211_reconfig_type { * required function. * The callback can sleep. * + * @offset_tsf: Offset the TSF timer by the specified value in the + * firmware/hardware. Preferred to set_tsf as it avoids delay between + * calling set_tsf() and hardware getting programmed, which will show up + * as TSF delay. Is not a required function. + * The callback can sleep. + * * @reset_tsf: Reset the TSF timer and allow firmware/hardware to synchronize * with other STAs in the IBSS. This is only used in IBSS mode. This * function is optional if the firmware/hardware takes full care of @@ -3549,6 +3555,8 @@ struct ieee80211_ops { u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf); + void (*offset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); int (*ampdu_action)(struct ieee80211_hw *hw, diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 5d35c0f37bb7..bcec1240f41d 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -556,9 +556,15 @@ static ssize_t ieee80211_if_parse_tsf( ret = kstrtoull(buf, 10, &tsf); if (ret < 0) return ret; - if (tsf_is_delta) - tsf = drv_get_tsf(local, sdata) + tsf_is_delta * tsf; - if (local->ops->set_tsf) { + if (tsf_is_delta && local->ops->offset_tsf) { + drv_offset_tsf(local, sdata, tsf_is_delta * tsf); + wiphy_info(local->hw.wiphy, + "debugfs offset TSF by %018lld\n", + tsf_is_delta * tsf); + } else if (local->ops->set_tsf) { + if (tsf_is_delta) + tsf = drv_get_tsf(local, sdata) + + tsf_is_delta * tsf; drv_set_tsf(local, sdata, tsf); wiphy_info(local->hw.wiphy, "debugfs set TSF to %#018llx\n", tsf); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index c701b6438bd9..bb886e7db47f 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -215,6 +215,21 @@ void drv_set_tsf(struct ieee80211_local *local, trace_drv_return_void(local); } +void drv_offset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_offset_tsf(local, sdata, offset); + if (local->ops->offset_tsf) + local->ops->offset_tsf(&local->hw, &sdata->vif, offset); + trace_drv_return_void(local); +} + void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index dea92c33b2ca..09f77e4a8a79 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -569,6 +569,9 @@ u64 drv_get_tsf(struct ieee80211_local *local, void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); +void drv_offset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index 64bc22ad9496..22ca43c500e4 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -70,9 +70,13 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) } spin_unlock_bh(&ifmsh->sync_offset_lock); - tsf = drv_get_tsf(local, sdata); - if (tsf != -1ULL) - drv_set_tsf(local, sdata, tsf + tsfdelta); + if (local->ops->offset_tsf) { + drv_offset_tsf(local, sdata, tsfdelta); + } else { + tsf = drv_get_tsf(local, sdata); + if (tsf != -1ULL) + drv_set_tsf(local, sdata, tsf + tsfdelta); + } } static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 37891fa67e9a..92a47afaa989 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -984,6 +984,32 @@ TRACE_EVENT(drv_set_tsf, ) ); +TRACE_EVENT(drv_offset_tsf, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset), + + TP_ARGS(local, sdata, offset), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(s64, tsf_offset) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->tsf_offset = offset; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " tsf offset:%lld", + LOCAL_PR_ARG, VIF_PR_ARG, + (unsigned long long)__entry->tsf_offset + ) +); + DEFINE_EVENT(local_sdata_evt, drv_reset_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), -- cgit v1.2.3 From bb42f2d13ffcd0baed7547b37d05add51fcd50e1 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 22 Sep 2016 19:04:20 +0200 Subject: mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TXQ intermediate queues can cause packet reordering when more than one flow is active to a single station. Since some of the wifi-specific packet handling (notably sequence number and encryption handling) is sensitive to re-ordering, things break if they are applied before the TXQ. This splits up the TX handlers and fast_xmit logic into two parts: An early part and a late part. The former is applied before TXQ enqueue, and the latter after dequeue. The non-TXQ path just applies both parts at once. Because fragments shouldn't be split up or reordered, the fragmentation handler is run after dequeue. Any fragments are then kept in the TXQ and on subsequent dequeues they take precedence over dequeueing from the FQ structure. This approach avoids having to scatter special cases all over the place for when TXQ is enabled, at the cost of making the fast_xmit and TX handler code slightly more complex. Signed-off-by: Toke Høiland-Jørgensen [fix a few code-style nits, make ieee80211_xmit_fast_finish void, remove a useless txq->sta check] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 + net/mac80211/ieee80211_i.h | 9 ++ net/mac80211/rx.c | 4 +- net/mac80211/sta_info.c | 10 +- net/mac80211/tx.c | 284 ++++++++++++++++++++++++++++++++------------- net/mac80211/util.c | 11 +- 6 files changed, 230 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c9f39538ac17..a810dfcb83c2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -715,6 +715,7 @@ enum mac80211_tx_info_flags { * frame (PS-Poll or uAPSD). * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame + * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * * These flags are used in tx_info->control.flags. */ @@ -723,6 +724,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1), IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), IEEE80211_TX_CTRL_AMSDU = BIT(3), + IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), }; /* diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2b391f242e58..8f8bddd5c8d8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -815,12 +815,14 @@ enum txq_info_flags { * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin * @def_cvars: codel vars for @def_flow + * @frags: used to keep fragments created after dequeue */ struct txq_info { struct fq_tin tin; struct fq_flow def_flow; struct codel_vars def_cvars; struct codel_stats cstats; + struct sk_buff_head frags; unsigned long flags; /* keep last! */ @@ -1498,6 +1500,13 @@ static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) return container_of(txq, struct txq_info, txq); } +static inline bool txq_has_queue(struct ieee80211_txq *txq) +{ + struct txq_info *txqi = to_txq_info(txq); + + return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); +} + static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { return ether_addr_equal(raddr, addr) || diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c9489a86e6d6..b2fe725881dc 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1323,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta) return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - - if (txqi->tin.backlog_packets) + if (txq_has_queue(sta->sta.txq[tid])) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1b1b28ff4fdb..167bff078bdd 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1212,12 +1212,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - - if (!txqi->tin.backlog_packets) + if (!txq_has_queue(sta->sta.txq[i])) continue; - drv_wake_tx_queue(local, txqi); + drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i])); } } @@ -1649,9 +1647,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - - if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) + if (!(tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 378a7a6b6dbe..0ea1b0d02186 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -853,8 +853,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) - hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); + hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; } @@ -1404,6 +1403,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, fq_flow_init(&txqi->def_flow); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); + __skb_queue_head_init(&txqi->frags); txqi->txq.vif = &sdata->vif; @@ -1426,6 +1426,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, struct fq_tin *tin = &txqi->tin; fq_tin_reset(fq, tin, fq_skb_free_func); + ieee80211_purge_tx_queue(&local->hw, &txqi->frags); } int ieee80211_txq_setup_flows(struct ieee80211_local *local) @@ -1495,6 +1496,47 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) spin_unlock_bh(&fq->lock); } +static bool ieee80211_queue_skb(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct fq *fq = &local->fq; + struct ieee80211_vif *vif; + struct txq_info *txqi; + struct ieee80211_sta *pubsta; + + if (!local->ops->wake_tx_queue || + sdata->vif.type == NL80211_IFTYPE_MONITOR) + return false; + + if (sta && sta->uploaded) + pubsta = &sta->sta; + else + pubsta = NULL; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + + vif = &sdata->vif; + txqi = ieee80211_get_txq(local, vif, pubsta, skb); + + if (!txqi) + return false; + + info->control.vif = vif; + + spin_lock_bh(&fq->lock); + ieee80211_txq_enqueue(local, txqi, skb); + spin_unlock_bh(&fq->lock); + + drv_wake_tx_queue(local, txqi); + + return true; +} + static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -1502,9 +1544,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, bool txpending) { struct ieee80211_tx_control control = {}; - struct fq *fq = &local->fq; struct sk_buff *skb, *tmp; - struct txq_info *txqi; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { @@ -1519,21 +1559,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, } #endif - txqi = ieee80211_get_txq(local, vif, sta, skb); - if (txqi) { - info->control.vif = vif; - - __skb_unlink(skb, skbs); - - spin_lock_bh(&fq->lock); - ieee80211_txq_enqueue(local, txqi, skb); - spin_unlock_bh(&fq->lock); - - drv_wake_tx_queue(local, txqi); - - continue; - } - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { @@ -1654,10 +1679,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local, /* * Invoke TX handlers, return 0 on success and non-zero if the * frame was dropped or queued. + * + * The handlers are split into an early and late part. The latter is everything + * that can be sensitive to reordering, and will be deferred to after packets + * are dequeued from the intermediate queues (when they are enabled). */ -static int invoke_tx_handlers(struct ieee80211_tx_data *tx) +static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); ieee80211_tx_result res = TX_DROP; #define CALL_TXH(txh) \ @@ -1675,6 +1703,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); + txh_done: + if (unlikely(res == TX_DROP)) { + I802_DEBUG_INC(tx->local->tx_handlers_drop); + if (tx->skb) + ieee80211_free_txskb(&tx->local->hw, tx->skb); + else + ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); + return -1; + } else if (unlikely(res == TX_QUEUED)) { + I802_DEBUG_INC(tx->local->tx_handlers_queued); + return -1; + } + + return 0; +} + +/* + * Late handlers can be called while the sta lock is held. Handlers that can + * cause packets to be generated will cause deadlock! + */ +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + ieee80211_tx_result res = TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { __skb_queue_tail(&tx->skbs, tx->skb); tx->skb = NULL; @@ -1707,6 +1760,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) return 0; } +static int invoke_tx_handlers(struct ieee80211_tx_data *tx) +{ + int r = invoke_tx_handlers_early(tx); + + if (r) + return r; + return invoke_tx_handlers_late(tx); +} + bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta) @@ -1781,7 +1843,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; - if (!invoke_tx_handlers(&tx)) + if (invoke_tx_handlers_early(&tx)) + return false; + + if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) + return true; + + if (!invoke_tx_handlers_late(&tx)) result = __ieee80211_tx(local, &tx.skbs, led_len, tx.sta, txpending); @@ -3125,8 +3193,71 @@ out: return ret; } +/* + * Can be called while the sta lock is held. Anything that can cause packets to + * be generated will cause deadlock! + */ +static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 pn_offs, + struct ieee80211_key *key, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)skb->data; + u8 tid = IEEE80211_NUM_TIDS; + + if (key) + info->control.hw_key = &key->conf; + + ieee80211_tx_stats(skb->dev, skb->len); + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + *ieee80211_get_qos_ctl(hdr) = tid; + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + } else { + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); + sdata->sequence_number += 0x10; + } + + if (skb_shinfo(skb)->gso_size) + sta->tx_stats.msdu[tid] += + DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); + else + sta->tx_stats.msdu[tid]++; + + info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + + /* statistics normally done by ieee80211_tx_h_stats (but that + * has to consider fragmentation, so is more complex) + */ + sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + + if (pn_offs) { + u64 pn; + u8 *crypto_hdr = skb->data + pn_offs; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + pn = atomic64_inc_return(&key->conf.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + } + } +} + static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, - struct net_device *dev, struct sta_info *sta, + struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { @@ -3177,8 +3308,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return true; } - ieee80211_tx_stats(dev, skb->len + extra_head); - if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) return true; @@ -3207,24 +3336,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG | (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); - - if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { - *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) - hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); - } else { - info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; - hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); - sdata->sequence_number += 0x10; - } - - if (skb_shinfo(skb)->gso_size) - sta->tx_stats.msdu[tid] += - DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); - else - sta->tx_stats.msdu[tid]++; - - info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT; __skb_queue_head_init(&tx.skbs); @@ -3234,9 +3346,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.sta = sta; tx.key = fast_tx->key; - if (fast_tx->key) - info->control.hw_key = &fast_tx->key->conf; - if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { tx.skb = skb; r = ieee80211_tx_h_rate_ctrl(&tx); @@ -3250,31 +3359,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, } } - /* statistics normally done by ieee80211_tx_h_stats (but that - * has to consider fragmentation, so is more complex) - */ - sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + if (ieee80211_queue_skb(local, sdata, sta, skb)) + return true; - if (fast_tx->pn_offs) { - u64 pn; - u8 *crypto_hdr = skb->data + fast_tx->pn_offs; - - switch (fast_tx->key->conf.cipher) { - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn); - crypto_hdr[0] = pn; - crypto_hdr[1] = pn >> 8; - crypto_hdr[4] = pn >> 16; - crypto_hdr[5] = pn >> 24; - crypto_hdr[6] = pn >> 32; - crypto_hdr[7] = pn >> 40; - break; - } - } + ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, + fast_tx->key, skb); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3294,12 +3383,21 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct sk_buff *skb = NULL; struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; + struct ieee80211_tx_info *info; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; spin_lock_bh(&fq->lock); if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) goto out; + /* Make sure fragments stay together. */ + skb = __skb_dequeue(&txqi->frags); + if (skb) + goto out; + +begin: skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); if (!skb) goto out; @@ -3307,16 +3405,46 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, ieee80211_set_skb_vif(skb, txqi); hdr = (struct ieee80211_hdr *)skb->data; - if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { + info = IEEE80211_SKB_CB(skb); + + memset(&tx, 0, sizeof(tx)); + __skb_queue_head_init(&tx.skbs); + tx.local = local; + tx.skb = skb; + tx.sdata = vif_to_sdata(info->control.vif); + + if (txq->sta) + tx.sta = container_of(txq->sta, struct sta_info, sta); + + /* + * The key can be removed while the packet was queued, so need to call + * this here to get the current key. + */ + r = ieee80211_tx_h_select_key(&tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + + if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, sta); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u8 pn_offs = 0; - hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid); - if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) - info->flags |= IEEE80211_TX_CTL_AMPDU; - else - info->flags &= ~IEEE80211_TX_CTL_AMPDU; + if (tx.key && + (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) + pn_offs = ieee80211_hdrlen(hdr->frame_control); + + ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, + tx.key, skb); + } else { + if (invoke_tx_handlers_late(&tx)) + goto begin; + + skb = __skb_dequeue(&tx.skbs); + + if (!skb_queue_empty(&tx.skbs)) + skb_queue_splice_tail(&tx.skbs, &txqi->frags); } out: @@ -3354,7 +3482,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, fast_tx = rcu_dereference(sta->fast_tx); if (fast_tx && - ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) + ieee80211_xmit_fast(sdata, sta, fast_tx, skb)) goto out; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 91754c8dafb2..545c79a42a77 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3441,11 +3441,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); + u32 frag_cnt = 0, frag_bytes = 0; + struct sk_buff *skb; + + skb_queue_walk(&txqi->frags, skb) { + frag_cnt++; + frag_bytes += skb->len; + } if (frame_cnt) - *frame_cnt = txqi->tin.backlog_packets; + *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) - *byte_cnt = txqi->tin.backlog_bytes; + *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); -- cgit v1.2.3 From df0adc788ae74e35ab1a79f3db878df7fdc7db55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 26 Sep 2016 22:12:49 +0100 Subject: rxrpc: Keep the call timeouts as ktimes rather than jiffies Keep that call timeouts as ktimes rather than jiffies so that they can be expressed as functions of RTT. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 25 +++++++-------- net/rxrpc/ar-internal.h | 8 ++--- net/rxrpc/call_event.c | 73 +++++++++++++++++++++++--------------------- net/rxrpc/call_object.c | 16 ++++------ net/rxrpc/input.c | 3 +- net/rxrpc/misc.c | 15 ++++++--- net/rxrpc/sendmsg.c | 8 ++--- net/rxrpc/sysctl.c | 8 ++--- 8 files changed, 82 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 67f03946ea4a..0383e5e9a0f3 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -453,17 +453,18 @@ TRACE_EVENT(rxrpc_rtt_rx, TRACE_EVENT(rxrpc_timer, TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why, - unsigned long now), + ktime_t now, unsigned long now_j), - TP_ARGS(call, why, now), + TP_ARGS(call, why, now, now_j), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) __field(enum rxrpc_timer_trace, why ) - __field(unsigned long, now ) - __field(unsigned long, expire_at ) - __field(unsigned long, ack_at ) - __field(unsigned long, resend_at ) + __field_struct(ktime_t, now ) + __field_struct(ktime_t, expire_at ) + __field_struct(ktime_t, ack_at ) + __field_struct(ktime_t, resend_at ) + __field(unsigned long, now_j ) __field(unsigned long, timer ) ), @@ -474,17 +475,17 @@ TRACE_EVENT(rxrpc_timer, __entry->expire_at = call->expire_at; __entry->ack_at = call->ack_at; __entry->resend_at = call->resend_at; + __entry->now_j = now_j; __entry->timer = call->timer.expires; ), - TP_printk("c=%p %s now=%lx x=%ld a=%ld r=%ld t=%ld", + TP_printk("c=%p %s x=%lld a=%lld r=%lld t=%ld", __entry->call, rxrpc_timer_traces[__entry->why], - __entry->now, - __entry->expire_at - __entry->now, - __entry->ack_at - __entry->now, - __entry->resend_at - __entry->now, - __entry->timer - __entry->now) + ktime_to_ns(ktime_sub(__entry->expire_at, __entry->now)), + ktime_to_ns(ktime_sub(__entry->ack_at, __entry->now)), + ktime_to_ns(ktime_sub(__entry->resend_at, __entry->now)), + __entry->timer - __entry->now_j) ); TRACE_EVENT(rxrpc_rx_lose, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 141c1458e719..d38dffd78085 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -464,9 +464,9 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ - unsigned long ack_at; /* When deferred ACK needs to happen */ - unsigned long resend_at; /* When next resend needs to happen */ - unsigned long expire_at; /* When the call times out */ + ktime_t ack_at; /* When deferred ACK needs to happen */ + ktime_t resend_at; /* When next resend needs to happen */ + ktime_t expire_at; /* When the call times out */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -805,7 +805,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1f6c7633b964..9ff3bb3ffb41 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,28 +24,40 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, + ktime_t now) { - unsigned long t, now = jiffies; + unsigned long t_j, now_j = jiffies; + ktime_t t; read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { t = call->expire_at; - if (time_before_eq(t, now)) + if (!ktime_after(t, now)) goto out; - if (time_after(call->resend_at, now) && - time_before(call->resend_at, t)) + if (ktime_after(call->resend_at, now) && + ktime_before(call->resend_at, t)) t = call->resend_at; - if (time_after(call->ack_at, now) && - time_before(call->ack_at, t)) + if (ktime_after(call->ack_at, now) && + ktime_before(call->ack_at, t)) t = call->ack_at; - if (call->timer.expires != t || !timer_pending(&call->timer)) { - mod_timer(&call->timer, t); - trace_rxrpc_timer(call, why, now); + t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); + t_j += jiffies; + + /* We have to make sure that the calculated jiffies value falls + * at or after the nsec value, or we may loop ceaselessly + * because the timer times out, but we haven't reached the nsec + * timeout yet. + */ + t_j++; + + if (call->timer.expires != t_j || !timer_pending(&call->timer)) { + mod_timer(&call->timer, t_j); + trace_rxrpc_timer(call, why, now, now_j); } } @@ -62,7 +74,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; - unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; + unsigned int expiry = rxrpc_soft_ack_delay; + ktime_t now, ack_at; s8 prior = rxrpc_ack_priority[ack_reason]; /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial @@ -111,7 +124,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, break; } - now = jiffies; if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { _debug("already scheduled"); } else if (immediate || expiry == 0) { @@ -120,11 +132,11 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, background) rxrpc_queue_call(call); } else { - ack_at = now + expiry; - _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); - if (time_before(ack_at, call->ack_at)) { + now = ktime_get_real(); + ack_at = ktime_add_ms(now, expiry); + if (ktime_before(ack_at, call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ack); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now); } } @@ -157,12 +169,12 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Perform retransmission of NAK'd and unack'd packets. */ -static void rxrpc_resend(struct rxrpc_call *call) +static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; + ktime_t max_age, oldest, ack_ts; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; @@ -212,14 +224,7 @@ static void rxrpc_resend(struct rxrpc_call *call) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); - call->resend_at = jiffies + - nsecs_to_jiffies(ktime_to_ns(ktime_sub(resend_at, now))) + - 1; /* We have to make sure that the calculated jiffies value - * falls at or after the nsec value, or we shall loop - * ceaselessly because the timer times out, but we haven't - * reached the nsec timeout yet. - */ + call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); if (unacked) rxrpc_congestion_timeout(call); @@ -229,7 +234,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * retransmitting data. */ if (!retrans) { - rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_ns(ack_ts) < call->peer->rtt) @@ -301,7 +306,7 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - unsigned long now; + ktime_t now; rxrpc_see_call(call); @@ -320,15 +325,15 @@ recheck_state: goto out_put; } - now = jiffies; - if (time_after_eq(now, call->expire_at)) { + now = ktime_get_real(); + if (ktime_before(call->expire_at, now)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || - time_after_eq(now, call->ack_at)) { + ktime_before(call->ack_at, now)) { call->ack_at = call->expire_at; if (call->ackr_reason) { rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); @@ -337,12 +342,12 @@ recheck_state: } if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || - time_after_eq(now, call->resend_at)) { - rxrpc_resend(call); + ktime_before(call->resend_at, now)) { + rxrpc_resend(call, now); goto recheck_state; } - rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d4b3293b78fa..456ab752d473 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -19,11 +19,6 @@ #include #include "ar-internal.h" -/* - * Maximum lifetime of a call (in jiffies). - */ -unsigned int rxrpc_max_call_lifetime = 60 * HZ; - const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", @@ -77,7 +72,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); if (call->state < RXRPC_CALL_COMPLETE) { - trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + trace_rxrpc_timer(call, rxrpc_timer_expired, + ktime_get_real(), jiffies); rxrpc_queue_call(call); } } @@ -207,14 +203,14 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, */ static void rxrpc_start_call_timer(struct rxrpc_call *call) { - unsigned long expire_at; + ktime_t now = ktime_get_real(), expire_at; - expire_at = jiffies + rxrpc_max_call_lifetime; + expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); call->expire_at = expire_at; call->ack_at = expire_at; call->resend_at = expire_at; - call->timer.expires = expire_at + 1; - rxrpc_set_timer(call, rxrpc_timer_begin); + call->timer.expires = jiffies + LONG_MAX / 2; + rxrpc_set_timer(call, rxrpc_timer_begin, now); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5ba35b4a907b..3ad9f75031e3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -328,7 +328,8 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) call->resend_at = call->expire_at; call->ack_at = call->expire_at; spin_unlock_bh(&call->lock); - rxrpc_set_timer(call, rxrpc_timer_init_for_reply); + rxrpc_set_timer(call, rxrpc_timer_init_for_reply, + ktime_get_real()); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 47dddacdbb91..9d1c721bc4e8 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -20,29 +20,34 @@ */ unsigned int rxrpc_max_backlog __read_mostly = 10; +/* + * Maximum lifetime of a call (in mx). + */ +unsigned int rxrpc_max_call_lifetime = 60 * 1000; + /* * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in jiffies). + * packet with RXRPC_REQUEST_ACK set (in ms). */ unsigned int rxrpc_requested_ack_delay = 1; /* - * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * How long to wait before scheduling an ACK with subtype DELAY (in ms). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned int rxrpc_soft_ack_delay = 1 * HZ; +unsigned int rxrpc_soft_ack_delay = 1 * 1000; /* - * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * How long to wait before scheduling an ACK with subtype IDLE (in ms). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; +unsigned int rxrpc_idle_ack_delay = 0.5 * 1000; /* * Receive window size in packets. This indicates the maximum number of diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index d8dfdce874d8..3322543d460a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -149,13 +149,13 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); } else { - unsigned long resend_at; + ktime_t now = ktime_get_real(), resend_at; - resend_at = jiffies + msecs_to_jiffies(rxrpc_resend_timeout); + resend_at = ktime_add_ms(now, rxrpc_resend_timeout); - if (time_before(resend_at, call->resend_at)) { + if (ktime_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_send); + rxrpc_set_timer(call, rxrpc_timer_set_for_send, now); } } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 13d1df03ebac..34c706d2f79c 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_requested_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&zero, }, { @@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_soft_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_idle_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -85,7 +85,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_max_call_lifetime, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, -- cgit v1.2.3 From 9095e10edd28e1e4a10ba5ca61fb54d9f74f8968 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 30 Sep 2016 19:08:06 +0200 Subject: mpls: move mpls_hdr to a common location This will be also used by openvswitch. Signed-off-by: Jiri Benc Acked-by: David Ahern Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/mpls.h | 9 +++++++++ net/mpls/internal.h | 10 +--------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/mpls.h b/include/net/mpls.h index 5b3b5addfb08..3ebbc0bb57ff 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -19,12 +19,21 @@ #define MPLS_HLEN 4 +struct mpls_shim_hdr { + __be32 label_stack_entry; +}; + static inline bool eth_p_mpls(__be16 eth_type) { return eth_type == htons(ETH_P_MPLS_UC) || eth_type == htons(ETH_P_MPLS_MC); } +static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) +{ + return (struct mpls_shim_hdr *)skb_network_header(skb); +} + /* * For non-MPLS skbs this will correspond to the network header. * For MPLS skbs it will be before the network_header as the MPLS diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 732a5c17e986..bdfef6c3271a 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,9 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H - -struct mpls_shim_hdr { - __be32 label_stack_entry; -}; +#include struct mpls_entry_decoded { u32 label; @@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -{ - return (struct mpls_shim_hdr *)skb_network_header(skb); -} - static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) { struct mpls_shim_hdr result; -- cgit v1.2.3 From 85de4a2101acb85c3b1dde465e84596ccca99f2c Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 30 Sep 2016 19:08:07 +0200 Subject: openvswitch: use mpls_hdr skb_mpls_header is equivalent to mpls_hdr now. Use the existing helper instead. Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/mpls.h | 12 ------------ net/openvswitch/actions.c | 24 ++++++++++++------------ 2 files changed, 12 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/mpls.h b/include/net/mpls.h index 3ebbc0bb57ff..1dbc669b770e 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -33,16 +33,4 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); } - -/* - * For non-MPLS skbs this will correspond to the network header. - * For MPLS skbs it will be before the network_header as the MPLS - * label stack lies between the end of the mac header and the network - * header. That is, for MPLS skbs the end of the mac header - * is the top of the MPLS label stack. - */ -static inline unsigned char *skb_mpls_header(struct sk_buff *skb) -{ - return skb_mac_header(skb) + skb->mac_len; -} #endif diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 863e992dfbc0..4e03f64709bc 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -160,7 +160,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { - __be32 *new_mpls_lse; + struct mpls_shim_hdr *new_mpls_lse; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) @@ -180,8 +180,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_reset_mac_header(skb); skb_set_network_header(skb, skb->mac_len); - new_mpls_lse = (__be32 *)skb_mpls_header(skb); - *new_mpls_lse = mpls->mpls_lse; + new_mpls_lse = mpls_hdr(skb); + new_mpls_lse->label_stack_entry = mpls->mpls_lse; skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); @@ -202,7 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (unlikely(err)) return err; - skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), skb->mac_len); @@ -211,10 +211,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_reset_mac_header(skb); skb_set_network_header(skb, skb->mac_len); - /* skb_mpls_header() is used to locate the ethertype - * field correctly in the presence of VLAN tags. + /* mpls_hdr() is used to locate the ethertype field correctly in the + * presence of VLAN tags. */ - hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -226,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { - __be32 *stack; + struct mpls_shim_hdr *stack; __be32 lse; int err; @@ -234,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (unlikely(err)) return err; - stack = (__be32 *)skb_mpls_header(skb); - lse = OVS_MASKED(*stack, *mpls_lse, *mask); + stack = mpls_hdr(skb); + lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(*stack), lse }; + __be32 diff[] = { ~(stack->label_stack_entry), lse }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } - *stack = lse; + stack->label_stack_entry = lse; flow_key->mpls.top_lse = lse; return 0; } -- cgit v1.2.3 From 0a7fb11c23c0fb8f5ad37f285f40348f1ab9ccbd Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sat, 1 Oct 2016 21:59:55 +0300 Subject: qed: Add Light L2 support Other protocols beside the networking driver need the ability of passing some L2 traffic, usually [although not limited] for the purpose of some management traffic. Signed-off-by: Yuval Mintz Signed-off-by: Ram Amrani Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/Kconfig | 8 + drivers/net/ethernet/qlogic/qed/Makefile | 1 + drivers/net/ethernet/qlogic/qed/qed.h | 9 + drivers/net/ethernet/qlogic/qed/qed_cxt.c | 2 + drivers/net/ethernet/qlogic/qed/qed_dev.c | 120 +- drivers/net/ethernet/qlogic/qed/qed_dev_api.h | 20 + drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1699 ++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_ll2.h | 289 ++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 23 +- drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 22 + drivers/net/ethernet/qlogic/qed/qed_sp.h | 4 + include/linux/qed/qed_if.h | 1 + include/linux/qed/qed_ll2_if.h | 139 ++ 13 files changed, 2334 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ll2.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ll2.h create mode 100644 include/linux/qed/qed_ll2_if.h (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 6ba48406899e..9eb3b1914cf5 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -88,6 +88,14 @@ config QED ---help--- This enables the support for ... +config QED_LL2 + bool "Qlogic QED Light L2 interface" + default n + depends on QED + ---help--- + This enables support for Light L2 interface which is required + by all qed protocol drivers other than qede. + config QED_SRIOV bool "QLogic QED 25/40/100Gb SR-IOV support" depends on QED && PCI_IOV diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index 86a5b4f5f870..e067098f10a9 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -4,3 +4,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \ qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o +qed-$(CONFIG_QED_LL2) += qed_ll2.o diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 0929582fc82b..91b571a3670b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -72,6 +72,7 @@ struct qed_sb_info; struct qed_sb_attn_info; struct qed_cxt_mngr; struct qed_sb_sp_info; +struct qed_ll2_info; struct qed_mcp_info; struct qed_rt_data { @@ -152,6 +153,7 @@ enum QED_RESOURCES { QED_MAC, QED_VLAN, QED_ILT, + QED_LL2_QUEUE, QED_MAX_RESC, }; @@ -360,6 +362,8 @@ struct qed_hwfn { struct qed_sb_attn_info *p_sb_attn; /* Protocol related */ + bool using_ll2; + struct qed_ll2_info *p_ll2_info; struct qed_pf_params pf_params; bool b_rdma_enabled_in_prs; @@ -564,6 +568,11 @@ struct qed_dev { struct qed_dbg_params dbg_params; +#ifdef CONFIG_QED_LL2 + struct qed_cb_ll2_info *ll2; + u8 ll2_mac_address[ETH_ALEN]; +#endif + const struct firmware *firmware; }; diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index dd579b2ef224..d9bea2a9c9f7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -1839,6 +1839,8 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn) /* Set the number of required CORE connections */ u32 core_cids = 1; /* SPQ */ + if (p_hwfn->using_ll2) + core_cids += 4; qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids, 0); switch (p_hwfn->hw_info.personality) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 13d8b4075b01..9a8e153df841 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -29,6 +29,7 @@ #include "qed_hw.h" #include "qed_init_ops.h" #include "qed_int.h" +#include "qed_ll2.h" #include "qed_mcp.h" #include "qed_reg_addr.h" #include "qed_sp.h" @@ -147,6 +148,9 @@ void qed_resc_free(struct qed_dev *cdev) qed_eq_free(p_hwfn, p_hwfn->p_eq); qed_consq_free(p_hwfn, p_hwfn->p_consq); qed_int_free(p_hwfn); +#ifdef CONFIG_QED_LL2 + qed_ll2_free(p_hwfn, p_hwfn->p_ll2_info); +#endif qed_iov_free(p_hwfn); qed_dmae_info_free(p_hwfn); qed_dcbx_info_free(p_hwfn, p_hwfn->p_dcbx_info); @@ -403,6 +407,9 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) int qed_resc_alloc(struct qed_dev *cdev) { +#ifdef CONFIG_QED_LL2 + struct qed_ll2_info *p_ll2_info; +#endif struct qed_consq *p_consq; struct qed_eq *p_eq; int i, rc = 0; @@ -513,6 +520,15 @@ int qed_resc_alloc(struct qed_dev *cdev) goto alloc_no_mem; p_hwfn->p_consq = p_consq; +#ifdef CONFIG_QED_LL2 + if (p_hwfn->using_ll2) { + p_ll2_info = qed_ll2_alloc(p_hwfn); + if (!p_ll2_info) + goto alloc_no_mem; + p_hwfn->p_ll2_info = p_ll2_info; + } +#endif + /* DMA info initialization */ rc = qed_dmae_info_alloc(p_hwfn); if (rc) @@ -561,6 +577,10 @@ void qed_resc_setup(struct qed_dev *cdev) qed_int_setup(p_hwfn, p_hwfn->p_main_ptt); qed_iov_setup(p_hwfn, p_hwfn->p_main_ptt); +#ifdef CONFIG_QED_LL2 + if (p_hwfn->using_ll2) + qed_ll2_setup(p_hwfn, p_hwfn->p_ll2_info); +#endif } } @@ -1304,6 +1324,7 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) resc_num[QED_VLAN] = (ETH_NUM_VLAN_FILTERS - 1 /*For vlan0*/) / num_funcs; resc_num[QED_ILT] = PXP_NUM_ILT_RECORDS_BB / num_funcs; + resc_num[QED_LL2_QUEUE] = MAX_NUM_LL2_RX_QUEUES / num_funcs; for (i = 0; i < QED_MAX_RESC; i++) resc_start[i] = resc_num[i] * enabled_func_idx; @@ -1327,7 +1348,8 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) "RL = %d start = %d\n" "MAC = %d start = %d\n" "VLAN = %d start = %d\n" - "ILT = %d start = %d\n", + "ILT = %d start = %d\n" + "LL2_QUEUE = %d start = %d\n", p_hwfn->hw_info.resc_num[QED_SB], p_hwfn->hw_info.resc_start[QED_SB], p_hwfn->hw_info.resc_num[QED_L2_QUEUE], @@ -1343,7 +1365,9 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) p_hwfn->hw_info.resc_num[QED_VLAN], p_hwfn->hw_info.resc_start[QED_VLAN], p_hwfn->hw_info.resc_num[QED_ILT], - p_hwfn->hw_info.resc_start[QED_ILT]); + p_hwfn->hw_info.resc_start[QED_ILT], + RESC_NUM(p_hwfn, QED_LL2_QUEUE), + RESC_START(p_hwfn, QED_LL2_QUEUE)); return 0; } @@ -2133,6 +2157,98 @@ int qed_fw_rss_eng(struct qed_hwfn *p_hwfn, u8 src_id, u8 *dst_id) return 0; } +static void qed_llh_mac_to_filter(u32 *p_high, u32 *p_low, + u8 *p_filter) +{ + *p_high = p_filter[1] | (p_filter[0] << 8); + *p_low = p_filter[5] | (p_filter[4] << 8) | + (p_filter[3] << 16) | (p_filter[2] << 24); +} + +int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter) +{ + u32 high = 0, low = 0, en; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return 0; + + qed_llh_mac_to_filter(&high, &low, p_filter); + + /* Find a free entry and utilize it */ + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + en = qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32)); + if (en) + continue; + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32), low); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), high); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE + + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 1); + break; + } + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) { + DP_NOTICE(p_hwfn, + "Failed to find an empty LLH filter to utilize\n"); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "mac: %pM is added at %d\n", + p_filter, i); + + return 0; +} + +void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter) +{ + u32 high = 0, low = 0; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return; + + qed_llh_mac_to_filter(&high, &low, p_filter); + + /* Find the entry and clean it */ + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32)) != low) + continue; + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32)) != high) + continue; + + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + 2 * i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), 0); + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "mac: %pM is removed from %d\n", + p_filter, i); + break; + } + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) + DP_NOTICE(p_hwfn, "Tried to remove a non-configured filter\n"); +} + static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 hw_addr, void *p_eth_qzone, size_t eth_qzone_size, u8 timeset) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index 343bb0344f62..b6711c106597 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -309,6 +309,26 @@ int qed_fw_rss_eng(struct qed_hwfn *p_hwfn, u8 src_id, u8 *dst_id); +/** + * @brief qed_llh_add_mac_filter - configures a MAC filter in llh + * + * @param p_hwfn + * @param p_ptt + * @param p_filter - MAC to add + */ +int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter); + +/** + * @brief qed_llh_remove_mac_filter - removes a MAC filter from llh + * + * @param p_hwfn + * @param p_ptt + * @param p_filter - MAC to remove + */ +void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter); + /** * *@brief Cleanup of previous driver remains prior to load * diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c new file mode 100644 index 000000000000..e0ec8ed2f92c --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -0,0 +1,1699 @@ +/* QLogic qed NIC Driver + * + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" + +#define QED_LL2_RX_REGISTERED(ll2) ((ll2)->rx_queue.b_cb_registred) +#define QED_LL2_TX_REGISTERED(ll2) ((ll2)->tx_queue.b_cb_registred) + +#define QED_LL2_TX_SIZE (256) +#define QED_LL2_RX_SIZE (4096) + +struct qed_cb_ll2_info { + int rx_cnt; + u32 rx_size; + u8 handle; + bool frags_mapped; + + /* Lock protecting LL2 buffer lists in sleepless context */ + spinlock_t lock; + struct list_head list; + + const struct qed_ll2_cb_ops *cbs; + void *cb_cookie; +}; + +struct qed_ll2_buffer { + struct list_head list; + void *data; + dma_addr_t phys_addr; +}; + +static void qed_ll2b_complete_tx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) +{ + struct qed_dev *cdev = p_hwfn->cdev; + struct sk_buff *skb = cookie; + + /* All we need to do is release the mapping */ + dma_unmap_single(&p_hwfn->cdev->pdev->dev, first_frag_addr, + skb_headlen(skb), DMA_TO_DEVICE); + + if (cdev->ll2->cbs && cdev->ll2->cbs->tx_cb) + cdev->ll2->cbs->tx_cb(cdev->ll2->cb_cookie, skb, + b_last_fragment); + + if (cdev->ll2->frags_mapped) + /* Case where mapped frags were received, need to + * free skb with nr_frags marked as 0 + */ + skb_shinfo(skb)->nr_frags = 0; + + dev_kfree_skb_any(skb); +} + +static int qed_ll2_alloc_buffer(struct qed_dev *cdev, + u8 **data, dma_addr_t *phys_addr) +{ + *data = kmalloc(cdev->ll2->rx_size, GFP_ATOMIC); + if (!(*data)) { + DP_INFO(cdev, "Failed to allocate LL2 buffer data\n"); + return -ENOMEM; + } + + *phys_addr = dma_map_single(&cdev->pdev->dev, + ((*data) + NET_SKB_PAD), + cdev->ll2->rx_size, DMA_FROM_DEVICE); + if (dma_mapping_error(&cdev->pdev->dev, *phys_addr)) { + DP_INFO(cdev, "Failed to map LL2 buffer data\n"); + kfree((*data)); + return -ENOMEM; + } + + return 0; +} + +static int qed_ll2_dealloc_buffer(struct qed_dev *cdev, + struct qed_ll2_buffer *buffer) +{ + spin_lock_bh(&cdev->ll2->lock); + + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); + kfree(buffer->data); + list_del(&buffer->list); + + cdev->ll2->rx_cnt--; + if (!cdev->ll2->rx_cnt) + DP_INFO(cdev, "All LL2 entries were removed\n"); + + spin_unlock_bh(&cdev->ll2->lock); + + return 0; +} + +static void qed_ll2_kill_buffers(struct qed_dev *cdev) +{ + struct qed_ll2_buffer *buffer, *tmp_buffer; + + list_for_each_entry_safe(buffer, tmp_buffer, &cdev->ll2->list, list) + qed_ll2_dealloc_buffer(cdev, buffer); +} + +void qed_ll2b_complete_rx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + struct qed_ll2_rx_packet *p_pkt, + struct core_rx_fast_path_cqe *p_cqe, + bool b_last_packet) +{ + u16 packet_length = le16_to_cpu(p_cqe->packet_length); + struct qed_ll2_buffer *buffer = p_pkt->cookie; + struct qed_dev *cdev = p_hwfn->cdev; + u16 vlan = le16_to_cpu(p_cqe->vlan); + u32 opaque_data_0, opaque_data_1; + u8 pad = p_cqe->placement_offset; + dma_addr_t new_phys_addr; + struct sk_buff *skb; + bool reuse = false; + int rc = -EINVAL; + u8 *new_data; + + opaque_data_0 = le32_to_cpu(p_cqe->opaque_data.data[0]); + opaque_data_1 = le32_to_cpu(p_cqe->opaque_data.data[1]); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_RX_STATUS | QED_MSG_STORAGE | NETIF_MSG_PKTDATA), + "Got an LL2 Rx completion: [Buffer at phys 0x%llx, offset 0x%02x] Length 0x%04x Parse_flags 0x%04x vlan 0x%04x Opaque data [0x%08x:0x%08x]\n", + (u64)p_pkt->rx_buf_addr, pad, packet_length, + le16_to_cpu(p_cqe->parse_flags.flags), vlan, + opaque_data_0, opaque_data_1); + + if ((cdev->dp_module & NETIF_MSG_PKTDATA) && buffer->data) { + print_hex_dump(KERN_INFO, "", + DUMP_PREFIX_OFFSET, 16, 1, + buffer->data, packet_length, false); + } + + /* Determine if data is valid */ + if (packet_length < ETH_HLEN) + reuse = true; + + /* Allocate a replacement for buffer; Reuse upon failure */ + if (!reuse) + rc = qed_ll2_alloc_buffer(p_hwfn->cdev, &new_data, + &new_phys_addr); + + /* If need to reuse or there's no replacement buffer, repost this */ + if (rc) + goto out_post; + + skb = build_skb(buffer->data, 0); + if (!skb) { + rc = -ENOMEM; + goto out_post; + } + + pad += NET_SKB_PAD; + skb_reserve(skb, pad); + skb_put(skb, packet_length); + skb_checksum_none_assert(skb); + + /* Get parital ethernet information instead of eth_type_trans(), + * Since we don't have an associated net_device. + */ + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + + /* Pass SKB onward */ + if (cdev->ll2->cbs && cdev->ll2->cbs->rx_cb) { + if (vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan); + cdev->ll2->cbs->rx_cb(cdev->ll2->cb_cookie, skb, + opaque_data_0, opaque_data_1); + } + + /* Update Buffer information and update FW producer */ + buffer->data = new_data; + buffer->phys_addr = new_phys_addr; + +out_post: + rc = qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev), cdev->ll2->handle, + buffer->phys_addr, 0, buffer, 1); + + if (rc) + qed_ll2_dealloc_buffer(cdev, buffer); +} + +static struct qed_ll2_info *__qed_ll2_handle_sanity(struct qed_hwfn *p_hwfn, + u8 connection_handle, + bool b_lock, + bool b_only_active) +{ + struct qed_ll2_info *p_ll2_conn, *p_ret = NULL; + + if (connection_handle >= QED_MAX_NUM_OF_LL2_CONNECTIONS) + return NULL; + + if (!p_hwfn->p_ll2_info) + return NULL; + + p_ll2_conn = &p_hwfn->p_ll2_info[connection_handle]; + + if (b_only_active) { + if (b_lock) + mutex_lock(&p_ll2_conn->mutex); + if (p_ll2_conn->b_active) + p_ret = p_ll2_conn; + if (b_lock) + mutex_unlock(&p_ll2_conn->mutex); + } else { + p_ret = p_ll2_conn; + } + + return p_ret; +} + +static struct qed_ll2_info *qed_ll2_handle_sanity(struct qed_hwfn *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, false, true); +} + +static struct qed_ll2_info *qed_ll2_handle_sanity_lock(struct qed_hwfn *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, true, true); +} + +static struct qed_ll2_info *qed_ll2_handle_sanity_inactive(struct qed_hwfn + *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, false, false); +} + +static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + bool b_last_packet = false, b_last_frag = false; + struct qed_ll2_tx_packet *p_pkt = NULL; + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_tx_queue *p_tx; + + p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + p_tx = &p_ll2_conn->tx_queue; + + while (!list_empty(&p_tx->active_descq)) { + p_pkt = list_first_entry(&p_tx->active_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + break; + + list_del(&p_pkt->list_entry); + b_last_packet = list_empty(&p_tx->active_descq); + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + p_tx->cur_completing_packet = *p_pkt; + p_tx->cur_completing_bd_idx = 1; + b_last_frag = p_tx->cur_completing_bd_idx == p_pkt->bd_used; + + qed_ll2b_complete_tx_packet(p_hwfn, p_ll2_conn->my_id, + p_pkt->cookie, + p_pkt->bds_set[0].tx_frag, + b_last_frag, b_last_packet); + } +} + +static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) +{ + struct qed_ll2_info *p_ll2_conn = p_cookie; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + u16 new_idx = 0, num_bds = 0, num_bds_in_packet = 0; + struct qed_ll2_tx_packet *p_pkt; + bool b_last_frag = false; + unsigned long flags; + int rc = -EINVAL; + + spin_lock_irqsave(&p_tx->lock, flags); + if (p_tx->b_completing_packet) { + rc = -EBUSY; + goto out; + } + + new_idx = le16_to_cpu(*p_tx->p_fw_cons); + num_bds = ((s16)new_idx - (s16)p_tx->bds_idx); + while (num_bds) { + if (list_empty(&p_tx->active_descq)) + goto out; + + p_pkt = list_first_entry(&p_tx->active_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + goto out; + + p_tx->b_completing_packet = true; + p_tx->cur_completing_packet = *p_pkt; + num_bds_in_packet = p_pkt->bd_used; + list_del(&p_pkt->list_entry); + + if (num_bds < num_bds_in_packet) { + DP_NOTICE(p_hwfn, + "Rest of BDs does not cover whole packet\n"); + goto out; + } + + num_bds -= num_bds_in_packet; + p_tx->bds_idx += num_bds_in_packet; + while (num_bds_in_packet--) + qed_chain_consume(&p_tx->txq_chain); + + p_tx->cur_completing_bd_idx = 1; + b_last_frag = p_tx->cur_completing_bd_idx == p_pkt->bd_used; + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + + spin_unlock_irqrestore(&p_tx->lock, flags); + qed_ll2b_complete_tx_packet(p_hwfn, + p_ll2_conn->my_id, + p_pkt->cookie, + p_pkt->bds_set[0].tx_frag, + b_last_frag, !num_bds); + spin_lock_irqsave(&p_tx->lock, flags); + } + + p_tx->b_completing_packet = false; + rc = 0; +out: + spin_unlock_irqrestore(&p_tx->lock, flags); + return rc; +} + +static int qed_ll2_rxq_completion_reg(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long lock_flags, + bool b_last_cqe) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct qed_ll2_rx_packet *p_pkt = NULL; + + if (!list_empty(&p_rx->active_descq)) + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + if (!p_pkt) { + DP_NOTICE(p_hwfn, + "LL2 Rx completion but active_descq is empty\n"); + return -EIO; + } + list_del(&p_pkt->list_entry); + + if (qed_chain_consume(&p_rx->rxq_chain) != p_pkt->rxq_bd) + DP_NOTICE(p_hwfn, + "Mismatch between active_descq and the LL2 Rx chain\n"); + list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); + + spin_unlock_irqrestore(&p_rx->lock, lock_flags); + qed_ll2b_complete_rx_packet(p_hwfn, p_ll2_conn->my_id, + p_pkt, &p_cqe->rx_cqe_fp, b_last_cqe); + spin_lock_irqsave(&p_rx->lock, lock_flags); + + return 0; +} + +static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) +{ + struct qed_ll2_info *p_ll2_conn = cookie; + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + union core_rx_cqe_union *cqe = NULL; + u16 cq_new_idx = 0, cq_old_idx = 0; + unsigned long flags = 0; + int rc = 0; + + spin_lock_irqsave(&p_rx->lock, flags); + cq_new_idx = le16_to_cpu(*p_rx->p_fw_cons); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + + while (cq_new_idx != cq_old_idx) { + bool b_last_cqe = (cq_new_idx == cq_old_idx); + + cqe = qed_chain_consume(&p_rx->rcq_chain); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + + DP_VERBOSE(p_hwfn, + QED_MSG_LL2, + "LL2 [sw. cons %04x, fw. at %04x] - Got Packet of type %02x\n", + cq_old_idx, cq_new_idx, cqe->rx_cqe_sp.type); + + switch (cqe->rx_cqe_sp.type) { + case CORE_RX_CQE_TYPE_SLOW_PATH: + DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n"); + rc = -EINVAL; + break; + case CORE_RX_CQE_TYPE_REGULAR: + rc = qed_ll2_rxq_completion_reg(p_hwfn, p_ll2_conn, + cqe, flags, b_last_cqe); + break; + default: + rc = -EIO; + } + } + + spin_unlock_irqrestore(&p_rx->lock, flags); + return rc; +} + +void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ll2_rx_packet *p_pkt = NULL; + struct qed_ll2_rx_queue *p_rx; + + p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + p_rx = &p_ll2_conn->rx_queue; + + while (!list_empty(&p_rx->active_descq)) { + dma_addr_t rx_buf_addr; + void *cookie; + bool b_last; + + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + if (!p_pkt) + break; + + list_del(&p_pkt->list_entry); + list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); + + rx_buf_addr = p_pkt->rx_buf_addr; + cookie = p_pkt->cookie; + + b_last = list_empty(&p_rx->active_descq); + } +} + +static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + u8 action_on_error) +{ + enum qed_ll2_conn_type conn_type = p_ll2_conn->conn_type; + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 cqe_pbl_size; + int rc = 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_RX_QUEUE_START, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.core_rx_queue_start; + + p_ramrod->sb_id = cpu_to_le16(qed_int_get_sp_sb_id(p_hwfn)); + p_ramrod->sb_index = p_rx->rx_sb_index; + p_ramrod->complete_event_flg = 1; + + p_ramrod->mtu = cpu_to_le16(p_ll2_conn->mtu); + DMA_REGPAIR_LE(p_ramrod->bd_base, + p_rx->rxq_chain.p_phys_addr); + cqe_pbl_size = (u16)qed_chain_get_page_cnt(&p_rx->rcq_chain); + p_ramrod->num_of_pbl_pages = cpu_to_le16(cqe_pbl_size); + DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, + qed_chain_get_pbl_phys(&p_rx->rcq_chain)); + + p_ramrod->drop_ttl0_flg = p_ll2_conn->rx_drop_ttl0_flg; + p_ramrod->inner_vlan_removal_en = p_ll2_conn->rx_vlan_removal_en; + p_ramrod->queue_id = p_ll2_conn->queue_id; + p_ramrod->main_func_queue = 1; + + if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) && + p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE)) { + p_ramrod->mf_si_bcast_accept_all = 1; + p_ramrod->mf_si_mcast_accept_all = 1; + } else { + p_ramrod->mf_si_bcast_accept_all = 0; + p_ramrod->mf_si_mcast_accept_all = 0; + } + + p_ramrod->action_on_error.error_type = action_on_error; + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + enum qed_ll2_conn_type conn_type = p_ll2_conn->conn_type; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + struct core_tx_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + union qed_qm_pq_params pq_params; + u16 pq_id = 0, pbl_size; + int rc = -EINVAL; + + if (!QED_LL2_TX_REGISTERED(p_ll2_conn)) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_TX_QUEUE_START, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.core_tx_queue_start; + + p_ramrod->sb_id = cpu_to_le16(qed_int_get_sp_sb_id(p_hwfn)); + p_ramrod->sb_index = p_tx->tx_sb_index; + p_ramrod->mtu = cpu_to_le16(p_ll2_conn->mtu); + p_ll2_conn->tx_stats_en = 1; + p_ramrod->stats_en = p_ll2_conn->tx_stats_en; + p_ramrod->stats_id = p_ll2_conn->tx_stats_id; + + DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, + qed_chain_get_pbl_phys(&p_tx->txq_chain)); + pbl_size = qed_chain_get_page_cnt(&p_tx->txq_chain); + p_ramrod->pbl_size = cpu_to_le16(pbl_size); + + memset(&pq_params, 0, sizeof(pq_params)); + pq_params.core.tc = p_ll2_conn->tx_tc; + pq_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_CORE, &pq_params); + p_ramrod->qm_pq_id = cpu_to_le16(pq_id); + + switch (conn_type) { + case QED_LL2_TYPE_ISCSI: + case QED_LL2_TYPE_ISCSI_OOO: + p_ramrod->conn_type = PROTOCOLID_ISCSI; + break; + case QED_LL2_TYPE_ROCE: + p_ramrod->conn_type = PROTOCOLID_ROCE; + break; + default: + p_ramrod->conn_type = PROTOCOLID_ETH; + DP_NOTICE(p_hwfn, "Unknown connection type: %d\n", conn_type); + } + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_rx_queue_stop(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct core_rx_stop_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_RX_QUEUE_STOP, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.core_rx_queue_stop; + + p_ramrod->complete_event_flg = 1; + p_ramrod->queue_id = p_ll2_conn->queue_id; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_tx_queue_stop(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_TX_QUEUE_STOP, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_ll2_acquire_connection_rx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info, u16 rx_num_desc) +{ + struct qed_ll2_rx_packet *p_descq; + u32 capacity; + int rc = 0; + + if (!rx_num_desc) + goto out; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_NEXT_PTR, + QED_CHAIN_CNT_TYPE_U16, + rx_num_desc, + sizeof(struct core_rx_bd), + &p_ll2_info->rx_queue.rxq_chain); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to allocate ll2 rxq chain\n"); + goto out; + } + + capacity = qed_chain_get_capacity(&p_ll2_info->rx_queue.rxq_chain); + p_descq = kcalloc(capacity, sizeof(struct qed_ll2_rx_packet), + GFP_KERNEL); + if (!p_descq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, "Failed to allocate ll2 Rx desc\n"); + goto out; + } + p_ll2_info->rx_queue.descq_array = p_descq; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + rx_num_desc, + sizeof(struct core_rx_fast_path_cqe), + &p_ll2_info->rx_queue.rcq_chain); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to allocate ll2 rcq chain\n"); + goto out; + } + + DP_VERBOSE(p_hwfn, QED_MSG_LL2, + "Allocated LL2 Rxq [Type %08x] with 0x%08x buffers\n", + p_ll2_info->conn_type, rx_num_desc); + +out: + return rc; +} + +static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info, + u16 tx_num_desc) +{ + struct qed_ll2_tx_packet *p_descq; + u32 capacity; + int rc = 0; + + if (!tx_num_desc) + goto out; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + tx_num_desc, + sizeof(struct core_tx_bd), + &p_ll2_info->tx_queue.txq_chain); + if (rc) + goto out; + + capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain); + p_descq = kcalloc(capacity, sizeof(struct qed_ll2_tx_packet), + GFP_KERNEL); + if (!p_descq) { + rc = -ENOMEM; + goto out; + } + p_ll2_info->tx_queue.descq_array = p_descq; + + DP_VERBOSE(p_hwfn, QED_MSG_LL2, + "Allocated LL2 Txq [Type %08x] with 0x%08x buffers\n", + p_ll2_info->conn_type, tx_num_desc); + +out: + if (rc) + DP_NOTICE(p_hwfn, + "Can't allocate memory for Tx LL2 with 0x%08x buffers\n", + tx_num_desc); + return rc; +} + +int qed_ll2_acquire_connection(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_params, + u16 rx_num_desc, + u16 tx_num_desc, + u8 *p_connection_handle) +{ + qed_int_comp_cb_t comp_rx_cb, comp_tx_cb; + struct qed_ll2_info *p_ll2_info = NULL; + int rc; + u8 i; + + if (!p_connection_handle || !p_hwfn->p_ll2_info) + return -EINVAL; + + /* Find a free connection to be used */ + for (i = 0; (i < QED_MAX_NUM_OF_LL2_CONNECTIONS); i++) { + mutex_lock(&p_hwfn->p_ll2_info[i].mutex); + if (p_hwfn->p_ll2_info[i].b_active) { + mutex_unlock(&p_hwfn->p_ll2_info[i].mutex); + continue; + } + + p_hwfn->p_ll2_info[i].b_active = true; + p_ll2_info = &p_hwfn->p_ll2_info[i]; + mutex_unlock(&p_hwfn->p_ll2_info[i].mutex); + break; + } + if (!p_ll2_info) + return -EBUSY; + + p_ll2_info->conn_type = p_params->conn_type; + p_ll2_info->mtu = p_params->mtu; + p_ll2_info->rx_drop_ttl0_flg = p_params->rx_drop_ttl0_flg; + p_ll2_info->rx_vlan_removal_en = p_params->rx_vlan_removal_en; + p_ll2_info->tx_tc = p_params->tx_tc; + p_ll2_info->tx_dest = p_params->tx_dest; + p_ll2_info->ai_err_packet_too_big = p_params->ai_err_packet_too_big; + p_ll2_info->ai_err_no_buf = p_params->ai_err_no_buf; + + rc = qed_ll2_acquire_connection_rx(p_hwfn, p_ll2_info, rx_num_desc); + if (rc) + goto q_allocate_fail; + + rc = qed_ll2_acquire_connection_tx(p_hwfn, p_ll2_info, tx_num_desc); + if (rc) + goto q_allocate_fail; + + /* Register callbacks for the Rx/Tx queues */ + comp_rx_cb = qed_ll2_rxq_completion; + comp_tx_cb = qed_ll2_txq_completion; + + if (rx_num_desc) { + qed_int_register_cb(p_hwfn, comp_rx_cb, + &p_hwfn->p_ll2_info[i], + &p_ll2_info->rx_queue.rx_sb_index, + &p_ll2_info->rx_queue.p_fw_cons); + p_ll2_info->rx_queue.b_cb_registred = true; + } + + if (tx_num_desc) { + qed_int_register_cb(p_hwfn, + comp_tx_cb, + &p_hwfn->p_ll2_info[i], + &p_ll2_info->tx_queue.tx_sb_index, + &p_ll2_info->tx_queue.p_fw_cons); + p_ll2_info->tx_queue.b_cb_registred = true; + } + + *p_connection_handle = i; + return rc; + +q_allocate_fail: + qed_ll2_release_connection(p_hwfn, i); + return -ENOMEM; +} + +static int qed_ll2_establish_connection_rx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + u8 action_on_error = 0; + + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) + return 0; + + DIRECT_REG_WR(p_ll2_conn->rx_queue.set_prod_addr, 0x0); + + SET_FIELD(action_on_error, + CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG, + p_ll2_conn->ai_err_packet_too_big); + SET_FIELD(action_on_error, + CORE_RX_ACTION_ON_ERROR_NO_BUFF, p_ll2_conn->ai_err_no_buf); + + return qed_sp_ll2_rx_queue_start(p_hwfn, p_ll2_conn, action_on_error); +} + +int qed_ll2_establish_connection(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_rx_queue *p_rx; + struct qed_ll2_tx_queue *p_tx; + int rc = -EINVAL; + u32 i, capacity; + u8 qid; + + p_ll2_conn = qed_ll2_handle_sanity_lock(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + p_rx = &p_ll2_conn->rx_queue; + p_tx = &p_ll2_conn->tx_queue; + + qed_chain_reset(&p_rx->rxq_chain); + qed_chain_reset(&p_rx->rcq_chain); + INIT_LIST_HEAD(&p_rx->active_descq); + INIT_LIST_HEAD(&p_rx->free_descq); + INIT_LIST_HEAD(&p_rx->posting_descq); + spin_lock_init(&p_rx->lock); + capacity = qed_chain_get_capacity(&p_rx->rxq_chain); + for (i = 0; i < capacity; i++) + list_add_tail(&p_rx->descq_array[i].list_entry, + &p_rx->free_descq); + *p_rx->p_fw_cons = 0; + + qed_chain_reset(&p_tx->txq_chain); + INIT_LIST_HEAD(&p_tx->active_descq); + INIT_LIST_HEAD(&p_tx->free_descq); + INIT_LIST_HEAD(&p_tx->sending_descq); + spin_lock_init(&p_tx->lock); + capacity = qed_chain_get_capacity(&p_tx->txq_chain); + for (i = 0; i < capacity; i++) + list_add_tail(&p_tx->descq_array[i].list_entry, + &p_tx->free_descq); + p_tx->cur_completing_bd_idx = 0; + p_tx->bds_idx = 0; + p_tx->b_completing_packet = false; + p_tx->cur_send_packet = NULL; + p_tx->cur_send_frag_num = 0; + p_tx->cur_completing_frag_num = 0; + *p_tx->p_fw_cons = 0; + + qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_ll2_conn->cid); + + qid = p_hwfn->hw_info.resc_start[QED_LL2_QUEUE] + connection_handle; + p_ll2_conn->queue_id = qid; + p_ll2_conn->tx_stats_id = qid; + p_rx->set_prod_addr = (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_TSDM_RAM + + TSTORM_LL2_RX_PRODS_OFFSET(qid); + p_tx->doorbell_addr = (u8 __iomem *)p_hwfn->doorbells + + qed_db_addr(p_ll2_conn->cid, + DQ_DEMS_LEGACY); + + rc = qed_ll2_establish_connection_rx(p_hwfn, p_ll2_conn); + if (rc) + return rc; + + rc = qed_sp_ll2_tx_queue_start(p_hwfn, p_ll2_conn); + if (rc) + return rc; + + if (p_hwfn->hw_info.personality != QED_PCI_ETH_ROCE) + qed_wr(p_hwfn, p_hwfn->p_main_ptt, PRS_REG_USE_LIGHT_L2, 1); + + return rc; +} + +static void qed_ll2_post_rx_buffer_notify_fw(struct qed_hwfn *p_hwfn, + struct qed_ll2_rx_queue *p_rx, + struct qed_ll2_rx_packet *p_curp) +{ + struct qed_ll2_rx_packet *p_posting_packet = NULL; + struct core_ll2_rx_prod rx_prod = { 0, 0, 0 }; + bool b_notify_fw = false; + u16 bd_prod, cq_prod; + + /* This handles the flushing of already posted buffers */ + while (!list_empty(&p_rx->posting_descq)) { + p_posting_packet = list_first_entry(&p_rx->posting_descq, + struct qed_ll2_rx_packet, + list_entry); + list_del(&p_posting_packet->list_entry); + list_add_tail(&p_posting_packet->list_entry, + &p_rx->active_descq); + b_notify_fw = true; + } + + /* This handles the supplied packet [if there is one] */ + if (p_curp) { + list_add_tail(&p_curp->list_entry, &p_rx->active_descq); + b_notify_fw = true; + } + + if (!b_notify_fw) + return; + + bd_prod = qed_chain_get_prod_idx(&p_rx->rxq_chain); + cq_prod = qed_chain_get_prod_idx(&p_rx->rcq_chain); + rx_prod.bd_prod = cpu_to_le16(bd_prod); + rx_prod.cqe_prod = cpu_to_le16(cq_prod); + DIRECT_REG_WR(p_rx->set_prod_addr, *((u32 *)&rx_prod)); +} + +int qed_ll2_post_rx_buffer(struct qed_hwfn *p_hwfn, + u8 connection_handle, + dma_addr_t addr, + u16 buf_len, void *cookie, u8 notify_fw) +{ + struct core_rx_bd_with_buff_len *p_curb = NULL; + struct qed_ll2_rx_packet *p_curp = NULL; + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_rx_queue *p_rx; + unsigned long flags; + void *p_data; + int rc = 0; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + p_rx = &p_ll2_conn->rx_queue; + + spin_lock_irqsave(&p_rx->lock, flags); + if (!list_empty(&p_rx->free_descq)) + p_curp = list_first_entry(&p_rx->free_descq, + struct qed_ll2_rx_packet, list_entry); + if (p_curp) { + if (qed_chain_get_elem_left(&p_rx->rxq_chain) && + qed_chain_get_elem_left(&p_rx->rcq_chain)) { + p_data = qed_chain_produce(&p_rx->rxq_chain); + p_curb = (struct core_rx_bd_with_buff_len *)p_data; + qed_chain_produce(&p_rx->rcq_chain); + } + } + + /* If we're lacking entires, let's try to flush buffers to FW */ + if (!p_curp || !p_curb) { + rc = -EBUSY; + p_curp = NULL; + goto out_notify; + } + + /* We have an Rx packet we can fill */ + DMA_REGPAIR_LE(p_curb->addr, addr); + p_curb->buff_length = cpu_to_le16(buf_len); + p_curp->rx_buf_addr = addr; + p_curp->cookie = cookie; + p_curp->rxq_bd = p_curb; + p_curp->buf_length = buf_len; + list_del(&p_curp->list_entry); + + /* Check if we only want to enqueue this packet without informing FW */ + if (!notify_fw) { + list_add_tail(&p_curp->list_entry, &p_rx->posting_descq); + goto out; + } + +out_notify: + qed_ll2_post_rx_buffer_notify_fw(p_hwfn, p_rx, p_curp); +out: + spin_unlock_irqrestore(&p_rx->lock, flags); + return rc; +} + +static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn, + struct qed_ll2_tx_queue *p_tx, + struct qed_ll2_tx_packet *p_curp, + u8 num_of_bds, + dma_addr_t first_frag, + u16 first_frag_len, void *p_cookie, + u8 notify_fw) +{ + list_del(&p_curp->list_entry); + p_curp->cookie = p_cookie; + p_curp->bd_used = num_of_bds; + p_curp->notify_fw = notify_fw; + p_tx->cur_send_packet = p_curp; + p_tx->cur_send_frag_num = 0; + + p_curp->bds_set[p_tx->cur_send_frag_num].tx_frag = first_frag; + p_curp->bds_set[p_tx->cur_send_frag_num].frag_len = first_frag_len; + p_tx->cur_send_frag_num++; +} + +static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2, + struct qed_ll2_tx_packet *p_curp, + u8 num_of_bds, + enum core_tx_dest tx_dest, + u16 vlan, + u8 bd_flags, + u16 l4_hdr_offset_w, + dma_addr_t first_frag, + u16 first_frag_len) +{ + struct qed_chain *p_tx_chain = &p_ll2->tx_queue.txq_chain; + u16 prod_idx = qed_chain_get_prod_idx(p_tx_chain); + struct core_tx_bd *start_bd = NULL; + u16 frag_idx; + + start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); + start_bd->nw_vlan_or_lb_echo = cpu_to_le16(vlan); + SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W, + cpu_to_le16(l4_hdr_offset_w)); + SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest); + start_bd->bd_flags.as_bitfield = bd_flags; + start_bd->bd_flags.as_bitfield |= CORE_TX_BD_FLAGS_START_BD_MASK << + CORE_TX_BD_FLAGS_START_BD_SHIFT; + SET_FIELD(start_bd->bitfield0, CORE_TX_BD_NBDS, num_of_bds); + DMA_REGPAIR_LE(start_bd->addr, first_frag); + start_bd->nbytes = cpu_to_le16(first_frag_len); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_TX_QUEUED | QED_MSG_LL2), + "LL2 [q 0x%02x cid 0x%08x type 0x%08x] Tx Producer at [0x%04x] - set with a %04x bytes %02x BDs buffer at %08x:%08x\n", + p_ll2->queue_id, + p_ll2->cid, + p_ll2->conn_type, + prod_idx, + first_frag_len, + num_of_bds, + le32_to_cpu(start_bd->addr.hi), + le32_to_cpu(start_bd->addr.lo)); + + if (p_ll2->tx_queue.cur_send_frag_num == num_of_bds) + return; + + /* Need to provide the packet with additional BDs for frags */ + for (frag_idx = p_ll2->tx_queue.cur_send_frag_num; + frag_idx < num_of_bds; frag_idx++) { + struct core_tx_bd **p_bd = &p_curp->bds_set[frag_idx].txq_bd; + + *p_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); + (*p_bd)->bd_flags.as_bitfield = 0; + (*p_bd)->bitfield1 = 0; + (*p_bd)->bitfield0 = 0; + p_curp->bds_set[frag_idx].tx_frag = 0; + p_curp->bds_set[frag_idx].frag_len = 0; + } +} + +/* This should be called while the Txq spinlock is being held */ +static void qed_ll2_tx_packet_notify(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + bool b_notify = p_ll2_conn->tx_queue.cur_send_packet->notify_fw; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + struct qed_ll2_tx_packet *p_pkt = NULL; + struct core_db_data db_msg = { 0, 0, 0 }; + u16 bd_prod; + + /* If there are missing BDs, don't do anything now */ + if (p_ll2_conn->tx_queue.cur_send_frag_num != + p_ll2_conn->tx_queue.cur_send_packet->bd_used) + return; + + /* Push the current packet to the list and clean after it */ + list_add_tail(&p_ll2_conn->tx_queue.cur_send_packet->list_entry, + &p_ll2_conn->tx_queue.sending_descq); + p_ll2_conn->tx_queue.cur_send_packet = NULL; + p_ll2_conn->tx_queue.cur_send_frag_num = 0; + + /* Notify FW of packet only if requested to */ + if (!b_notify) + return; + + bd_prod = qed_chain_get_prod_idx(&p_ll2_conn->tx_queue.txq_chain); + + while (!list_empty(&p_tx->sending_descq)) { + p_pkt = list_first_entry(&p_tx->sending_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + break; + + list_del(&p_pkt->list_entry); + list_add_tail(&p_pkt->list_entry, &p_tx->active_descq); + } + + SET_FIELD(db_msg.params, CORE_DB_DATA_DEST, DB_DEST_XCM); + SET_FIELD(db_msg.params, CORE_DB_DATA_AGG_CMD, DB_AGG_CMD_SET); + SET_FIELD(db_msg.params, CORE_DB_DATA_AGG_VAL_SEL, + DQ_XCM_CORE_TX_BD_PROD_CMD); + db_msg.agg_flags = DQ_XCM_CORE_DQ_CF_CMD; + db_msg.spq_prod = cpu_to_le16(bd_prod); + + /* Make sure the BDs data is updated before ringing the doorbell */ + wmb(); + + DIRECT_REG_WR(p_tx->doorbell_addr, *((u32 *)&db_msg)); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_TX_QUEUED | QED_MSG_LL2), + "LL2 [q 0x%02x cid 0x%08x type 0x%08x] Doorbelled [producer 0x%04x]\n", + p_ll2_conn->queue_id, + p_ll2_conn->cid, p_ll2_conn->conn_type, db_msg.spq_prod); +} + +int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + u8 num_of_bds, + u16 vlan, + u8 bd_flags, + u16 l4_hdr_offset_w, + dma_addr_t first_frag, + u16 first_frag_len, void *cookie, u8 notify_fw) +{ + struct qed_ll2_tx_packet *p_curp = NULL; + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ll2_tx_queue *p_tx; + struct qed_chain *p_tx_chain; + unsigned long flags; + int rc = 0; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + p_tx = &p_ll2_conn->tx_queue; + p_tx_chain = &p_tx->txq_chain; + + if (num_of_bds > CORE_LL2_TX_MAX_BDS_PER_PACKET) + return -EIO; + + spin_lock_irqsave(&p_tx->lock, flags); + if (p_tx->cur_send_packet) { + rc = -EEXIST; + goto out; + } + + /* Get entry, but only if we have tx elements for it */ + if (!list_empty(&p_tx->free_descq)) + p_curp = list_first_entry(&p_tx->free_descq, + struct qed_ll2_tx_packet, list_entry); + if (p_curp && qed_chain_get_elem_left(p_tx_chain) < num_of_bds) + p_curp = NULL; + + if (!p_curp) { + rc = -EBUSY; + goto out; + } + + /* Prepare packet and BD, and perhaps send a doorbell to FW */ + qed_ll2_prepare_tx_packet_set(p_hwfn, p_tx, p_curp, + num_of_bds, first_frag, + first_frag_len, cookie, notify_fw); + qed_ll2_prepare_tx_packet_set_bd(p_hwfn, p_ll2_conn, p_curp, + num_of_bds, CORE_TX_DEST_NW, + vlan, bd_flags, l4_hdr_offset_w, + first_frag, first_frag_len); + + qed_ll2_tx_packet_notify(p_hwfn, p_ll2_conn); + +out: + spin_unlock_irqrestore(&p_tx->lock, flags); + return rc; +} + +int qed_ll2_set_fragment_of_tx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + dma_addr_t addr, u16 nbytes) +{ + struct qed_ll2_tx_packet *p_cur_send_packet = NULL; + struct qed_ll2_info *p_ll2_conn = NULL; + u16 cur_send_frag_num = 0; + struct core_tx_bd *p_bd; + unsigned long flags; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + + if (!p_ll2_conn->tx_queue.cur_send_packet) + return -EINVAL; + + p_cur_send_packet = p_ll2_conn->tx_queue.cur_send_packet; + cur_send_frag_num = p_ll2_conn->tx_queue.cur_send_frag_num; + + if (cur_send_frag_num >= p_cur_send_packet->bd_used) + return -EINVAL; + + /* Fill the BD information, and possibly notify FW */ + p_bd = p_cur_send_packet->bds_set[cur_send_frag_num].txq_bd; + DMA_REGPAIR_LE(p_bd->addr, addr); + p_bd->nbytes = cpu_to_le16(nbytes); + p_cur_send_packet->bds_set[cur_send_frag_num].tx_frag = addr; + p_cur_send_packet->bds_set[cur_send_frag_num].frag_len = nbytes; + + p_ll2_conn->tx_queue.cur_send_frag_num++; + + spin_lock_irqsave(&p_ll2_conn->tx_queue.lock, flags); + qed_ll2_tx_packet_notify(p_hwfn, p_ll2_conn); + spin_unlock_irqrestore(&p_ll2_conn->tx_queue.lock, flags); + + return 0; +} + +int qed_ll2_terminate_connection(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + struct qed_ll2_info *p_ll2_conn = NULL; + int rc = -EINVAL; + + p_ll2_conn = qed_ll2_handle_sanity_lock(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + + /* Stop Tx & Rx of connection, if needed */ + if (QED_LL2_TX_REGISTERED(p_ll2_conn)) { + rc = qed_sp_ll2_tx_queue_stop(p_hwfn, p_ll2_conn); + if (rc) + return rc; + qed_ll2_txq_flush(p_hwfn, connection_handle); + } + + if (QED_LL2_RX_REGISTERED(p_ll2_conn)) { + rc = qed_sp_ll2_rx_queue_stop(p_hwfn, p_ll2_conn); + if (rc) + return rc; + qed_ll2_rxq_flush(p_hwfn, connection_handle); + } + + return rc; +} + +void qed_ll2_release_connection(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + struct qed_ll2_info *p_ll2_conn = NULL; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + if (QED_LL2_RX_REGISTERED(p_ll2_conn)) { + p_ll2_conn->rx_queue.b_cb_registred = false; + qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index); + } + + if (QED_LL2_TX_REGISTERED(p_ll2_conn)) { + p_ll2_conn->tx_queue.b_cb_registred = false; + qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index); + } + + kfree(p_ll2_conn->tx_queue.descq_array); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain); + + kfree(p_ll2_conn->rx_queue.descq_array); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->rx_queue.rxq_chain); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->rx_queue.rcq_chain); + + qed_cxt_release_cid(p_hwfn, p_ll2_conn->cid); + + mutex_lock(&p_ll2_conn->mutex); + p_ll2_conn->b_active = false; + mutex_unlock(&p_ll2_conn->mutex); +} + +struct qed_ll2_info *qed_ll2_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_ll2_info *p_ll2_connections; + u8 i; + + /* Allocate LL2's set struct */ + p_ll2_connections = kcalloc(QED_MAX_NUM_OF_LL2_CONNECTIONS, + sizeof(struct qed_ll2_info), GFP_KERNEL); + if (!p_ll2_connections) { + DP_NOTICE(p_hwfn, "Failed to allocate `struct qed_ll2'\n"); + return NULL; + } + + for (i = 0; i < QED_MAX_NUM_OF_LL2_CONNECTIONS; i++) + p_ll2_connections[i].my_id = i; + + return p_ll2_connections; +} + +void qed_ll2_setup(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_connections) +{ + int i; + + for (i = 0; i < QED_MAX_NUM_OF_LL2_CONNECTIONS; i++) + mutex_init(&p_ll2_connections[i].mutex); +} + +void qed_ll2_free(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_connections) +{ + kfree(p_ll2_connections); +} + +static void _qed_ll2_get_tstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_tstorm_per_queue_stat tstats; + u8 qid = p_ll2_conn->queue_id; + u32 tstats_addr; + + memset(&tstats, 0, sizeof(tstats)); + tstats_addr = BAR0_MAP_REG_TSDM_RAM + + CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(qid); + qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, sizeof(tstats)); + + p_stats->packet_too_big_discard = + HILO_64_REGPAIR(tstats.packet_too_big_discard); + p_stats->no_buff_discard = HILO_64_REGPAIR(tstats.no_buff_discard); +} + +static void _qed_ll2_get_ustats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_ustorm_per_queue_stat ustats; + u8 qid = p_ll2_conn->queue_id; + u32 ustats_addr; + + memset(&ustats, 0, sizeof(ustats)); + ustats_addr = BAR0_MAP_REG_USDM_RAM + + CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(qid); + qed_memcpy_from(p_hwfn, p_ptt, &ustats, ustats_addr, sizeof(ustats)); + + p_stats->rcv_ucast_bytes = HILO_64_REGPAIR(ustats.rcv_ucast_bytes); + p_stats->rcv_mcast_bytes = HILO_64_REGPAIR(ustats.rcv_mcast_bytes); + p_stats->rcv_bcast_bytes = HILO_64_REGPAIR(ustats.rcv_bcast_bytes); + p_stats->rcv_ucast_pkts = HILO_64_REGPAIR(ustats.rcv_ucast_pkts); + p_stats->rcv_mcast_pkts = HILO_64_REGPAIR(ustats.rcv_mcast_pkts); + p_stats->rcv_bcast_pkts = HILO_64_REGPAIR(ustats.rcv_bcast_pkts); +} + +static void _qed_ll2_get_pstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_pstorm_per_queue_stat pstats; + u8 stats_id = p_ll2_conn->tx_stats_id; + u32 pstats_addr; + + memset(&pstats, 0, sizeof(pstats)); + pstats_addr = BAR0_MAP_REG_PSDM_RAM + + CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(stats_id); + qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, sizeof(pstats)); + + p_stats->sent_ucast_bytes = HILO_64_REGPAIR(pstats.sent_ucast_bytes); + p_stats->sent_mcast_bytes = HILO_64_REGPAIR(pstats.sent_mcast_bytes); + p_stats->sent_bcast_bytes = HILO_64_REGPAIR(pstats.sent_bcast_bytes); + p_stats->sent_ucast_pkts = HILO_64_REGPAIR(pstats.sent_ucast_pkts); + p_stats->sent_mcast_pkts = HILO_64_REGPAIR(pstats.sent_mcast_pkts); + p_stats->sent_bcast_pkts = HILO_64_REGPAIR(pstats.sent_bcast_pkts); +} + +int qed_ll2_get_stats(struct qed_hwfn *p_hwfn, + u8 connection_handle, struct qed_ll2_stats *p_stats) +{ + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ptt *p_ptt; + + memset(p_stats, 0, sizeof(*p_stats)); + + if ((connection_handle >= QED_MAX_NUM_OF_LL2_CONNECTIONS) || + !p_hwfn->p_ll2_info) + return -EINVAL; + + p_ll2_conn = &p_hwfn->p_ll2_info[connection_handle]; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + return -EINVAL; + } + + _qed_ll2_get_tstats(p_hwfn, p_ptt, p_ll2_conn, p_stats); + _qed_ll2_get_ustats(p_hwfn, p_ptt, p_ll2_conn, p_stats); + if (p_ll2_conn->tx_stats_en) + _qed_ll2_get_pstats(p_hwfn, p_ptt, p_ll2_conn, p_stats); + + qed_ptt_release(p_hwfn, p_ptt); + return 0; +} + +static void qed_ll2_register_cb_ops(struct qed_dev *cdev, + const struct qed_ll2_cb_ops *ops, + void *cookie) +{ + cdev->ll2->cbs = ops; + cdev->ll2->cb_cookie = cookie; +} + +static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params) +{ + struct qed_ll2_info ll2_info; + struct qed_ll2_buffer *buffer; + enum qed_ll2_conn_type conn_type; + struct qed_ptt *p_ptt; + int rc, i; + + /* Initialize LL2 locks & lists */ + INIT_LIST_HEAD(&cdev->ll2->list); + spin_lock_init(&cdev->ll2->lock); + cdev->ll2->rx_size = NET_SKB_PAD + ETH_HLEN + + L1_CACHE_BYTES + params->mtu; + cdev->ll2->frags_mapped = params->frags_mapped; + + /*Allocate memory for LL2 */ + DP_INFO(cdev, "Allocating LL2 buffers of size %08x bytes\n", + cdev->ll2->rx_size); + for (i = 0; i < QED_LL2_RX_SIZE; i++) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { + DP_INFO(cdev, "Failed to allocate LL2 buffers\n"); + goto fail; + } + + rc = qed_ll2_alloc_buffer(cdev, (u8 **)&buffer->data, + &buffer->phys_addr); + if (rc) { + kfree(buffer); + goto fail; + } + + list_add_tail(&buffer->list, &cdev->ll2->list); + } + + switch (QED_LEADING_HWFN(cdev)->hw_info.personality) { + case QED_PCI_ISCSI: + conn_type = QED_LL2_TYPE_ISCSI; + break; + case QED_PCI_ETH_ROCE: + conn_type = QED_LL2_TYPE_ROCE; + break; + default: + conn_type = QED_LL2_TYPE_TEST; + } + + /* Prepare the temporary ll2 information */ + memset(&ll2_info, 0, sizeof(ll2_info)); + ll2_info.conn_type = conn_type; + ll2_info.mtu = params->mtu; + ll2_info.rx_drop_ttl0_flg = params->drop_ttl0_packets; + ll2_info.rx_vlan_removal_en = params->rx_vlan_stripping; + ll2_info.tx_tc = 0; + ll2_info.tx_dest = CORE_TX_DEST_NW; + + rc = qed_ll2_acquire_connection(QED_LEADING_HWFN(cdev), &ll2_info, + QED_LL2_RX_SIZE, QED_LL2_TX_SIZE, + &cdev->ll2->handle); + if (rc) { + DP_INFO(cdev, "Failed to acquire LL2 connection\n"); + goto fail; + } + + rc = qed_ll2_establish_connection(QED_LEADING_HWFN(cdev), + cdev->ll2->handle); + if (rc) { + DP_INFO(cdev, "Failed to establish LL2 connection\n"); + goto release_fail; + } + + /* Post all Rx buffers to FW */ + spin_lock_bh(&cdev->ll2->lock); + list_for_each_entry(buffer, &cdev->ll2->list, list) { + rc = qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, + buffer->phys_addr, 0, buffer, 1); + if (rc) { + DP_INFO(cdev, + "Failed to post an Rx buffer; Deleting it\n"); + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); + kfree(buffer->data); + list_del(&buffer->list); + kfree(buffer); + } else { + cdev->ll2->rx_cnt++; + } + } + spin_unlock_bh(&cdev->ll2->lock); + + if (!cdev->ll2->rx_cnt) { + DP_INFO(cdev, "Failed passing even a single Rx buffer\n"); + goto release_terminate; + } + + if (!is_valid_ether_addr(params->ll2_mac_address)) { + DP_INFO(cdev, "Invalid Ethernet address\n"); + goto release_terminate; + } + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) { + DP_INFO(cdev, "Failed to acquire PTT\n"); + goto release_terminate; + } + + rc = qed_llh_add_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + params->ll2_mac_address); + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + if (rc) { + DP_ERR(cdev, "Failed to allocate LLH filter\n"); + goto release_terminate_all; + } + + ether_addr_copy(cdev->ll2_mac_address, params->ll2_mac_address); + + return 0; + +release_terminate_all: + +release_terminate: + qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); +release_fail: + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); +fail: + qed_ll2_kill_buffers(cdev); + cdev->ll2->handle = QED_LL2_UNUSED_HANDLE; + return -EINVAL; +} + +static int qed_ll2_stop(struct qed_dev *cdev) +{ + struct qed_ptt *p_ptt; + int rc; + + if (cdev->ll2->handle == QED_LL2_UNUSED_HANDLE) + return 0; + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) { + DP_INFO(cdev, "Failed to acquire PTT\n"); + goto fail; + } + + qed_llh_remove_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + cdev->ll2_mac_address); + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + eth_zero_addr(cdev->ll2_mac_address); + + rc = qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), + cdev->ll2->handle); + if (rc) + DP_INFO(cdev, "Failed to terminate LL2 connection\n"); + + qed_ll2_kill_buffers(cdev); + + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); + cdev->ll2->handle = QED_LL2_UNUSED_HANDLE; + + return rc; +fail: + return -EINVAL; +} + +static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb) +{ + const skb_frag_t *frag; + int rc = -EINVAL, i; + dma_addr_t mapping; + u16 vlan = 0; + u8 flags = 0; + + if (unlikely(skb->ip_summed != CHECKSUM_NONE)) { + DP_INFO(cdev, "Cannot transmit a checksumed packet\n"); + return -EINVAL; + } + + if (1 + skb_shinfo(skb)->nr_frags > CORE_LL2_TX_MAX_BDS_PER_PACKET) { + DP_ERR(cdev, "Cannot transmit a packet with %d fragments\n", + 1 + skb_shinfo(skb)->nr_frags); + return -EINVAL; + } + + mapping = dma_map_single(&cdev->pdev->dev, skb->data, + skb->len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&cdev->pdev->dev, mapping))) { + DP_NOTICE(cdev, "SKB mapping failed\n"); + return -EINVAL; + } + + /* Request HW to calculate IP csum */ + if (!((vlan_get_protocol(skb) == htons(ETH_P_IPV6)) && + ipv6_hdr(skb)->nexthdr == NEXTHDR_IPV6)) + flags |= BIT(CORE_TX_BD_FLAGS_IP_CSUM_SHIFT); + + if (skb_vlan_tag_present(skb)) { + vlan = skb_vlan_tag_get(skb); + flags |= BIT(CORE_TX_BD_FLAGS_VLAN_INSERTION_SHIFT); + } + + rc = qed_ll2_prepare_tx_packet(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, + 1 + skb_shinfo(skb)->nr_frags, + vlan, flags, 0, mapping, + skb->len, skb, 1); + if (rc) + goto err; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + frag = &skb_shinfo(skb)->frags[i]; + if (!cdev->ll2->frags_mapped) { + mapping = skb_frag_dma_map(&cdev->pdev->dev, frag, 0, + skb_frag_size(frag), + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&cdev->pdev->dev, + mapping))) { + DP_NOTICE(cdev, + "Unable to map frag - dropping packet\n"); + goto err; + } + } else { + mapping = page_to_phys(skb_frag_page(frag)) | + frag->page_offset; + } + + rc = qed_ll2_set_fragment_of_tx_packet(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, + mapping, + skb_frag_size(frag)); + + /* if failed not much to do here, partial packet has been posted + * we can't free memory, will need to wait for completion. + */ + if (rc) + goto err2; + } + + return 0; + +err: + dma_unmap_single(&cdev->pdev->dev, mapping, skb->len, DMA_TO_DEVICE); + +err2: + return rc; +} + +static int qed_ll2_stats(struct qed_dev *cdev, struct qed_ll2_stats *stats) +{ + if (!cdev->ll2) + return -EINVAL; + + return qed_ll2_get_stats(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, stats); +} + +const struct qed_ll2_ops qed_ll2_ops_pass = { + .start = &qed_ll2_start, + .stop = &qed_ll2_stop, + .start_xmit = &qed_ll2_start_xmit, + .register_cb_ops = &qed_ll2_register_cb_ops, + .get_stats = &qed_ll2_stats, +}; + +int qed_ll2_alloc_if(struct qed_dev *cdev) +{ + cdev->ll2 = kzalloc(sizeof(*cdev->ll2), GFP_KERNEL); + return cdev->ll2 ? 0 : -ENOMEM; +} + +void qed_ll2_dealloc_if(struct qed_dev *cdev) +{ + kfree(cdev->ll2); + cdev->ll2 = NULL; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h new file mode 100644 index 000000000000..a037c4845928 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -0,0 +1,289 @@ +/* QLogic qed NIC Driver + * + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QED_LL2_H +#define _QED_LL2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_sp.h" + +#define QED_MAX_NUM_OF_LL2_CONNECTIONS (4) + +enum qed_ll2_conn_type { + QED_LL2_TYPE_RESERVED, + QED_LL2_TYPE_ISCSI, + QED_LL2_TYPE_TEST, + QED_LL2_TYPE_ISCSI_OOO, + QED_LL2_TYPE_RESERVED2, + QED_LL2_TYPE_ROCE, + QED_LL2_TYPE_RESERVED3, + MAX_QED_LL2_RX_CONN_TYPE +}; + +struct qed_ll2_rx_packet { + struct list_head list_entry; + struct core_rx_bd_with_buff_len *rxq_bd; + dma_addr_t rx_buf_addr; + u16 buf_length; + void *cookie; + u8 placement_offset; + u16 parse_flags; + u16 packet_length; + u16 vlan; + u32 opaque_data[2]; +}; + +struct qed_ll2_tx_packet { + struct list_head list_entry; + u16 bd_used; + u16 vlan; + u16 l4_hdr_offset_w; + u8 bd_flags; + bool notify_fw; + void *cookie; + + struct { + struct core_tx_bd *txq_bd; + dma_addr_t tx_frag; + u16 frag_len; + } bds_set[ETH_TX_MAX_BDS_PER_NON_LSO_PACKET]; +}; + +struct qed_ll2_rx_queue { + /* Lock protecting the Rx queue manipulation */ + spinlock_t lock; + struct qed_chain rxq_chain; + struct qed_chain rcq_chain; + u8 rx_sb_index; + bool b_cb_registred; + __le16 *p_fw_cons; + struct list_head active_descq; + struct list_head free_descq; + struct list_head posting_descq; + struct qed_ll2_rx_packet *descq_array; + void __iomem *set_prod_addr; +}; + +struct qed_ll2_tx_queue { + /* Lock protecting the Tx queue manipulation */ + spinlock_t lock; + struct qed_chain txq_chain; + u8 tx_sb_index; + bool b_cb_registred; + __le16 *p_fw_cons; + struct list_head active_descq; + struct list_head free_descq; + struct list_head sending_descq; + struct qed_ll2_tx_packet *descq_array; + struct qed_ll2_tx_packet *cur_send_packet; + struct qed_ll2_tx_packet cur_completing_packet; + u16 cur_completing_bd_idx; + void __iomem *doorbell_addr; + u16 bds_idx; + u16 cur_send_frag_num; + u16 cur_completing_frag_num; + bool b_completing_packet; +}; + +struct qed_ll2_info { + /* Lock protecting the state of LL2 */ + struct mutex mutex; + enum qed_ll2_conn_type conn_type; + u32 cid; + u8 my_id; + u8 queue_id; + u8 tx_stats_id; + bool b_active; + u16 mtu; + u8 rx_drop_ttl0_flg; + u8 rx_vlan_removal_en; + u8 tx_tc; + enum core_tx_dest tx_dest; + enum core_error_handle ai_err_packet_too_big; + enum core_error_handle ai_err_no_buf; + u8 tx_stats_en; + struct qed_ll2_rx_queue rx_queue; + struct qed_ll2_tx_queue tx_queue; +}; + +/** + * @brief qed_ll2_acquire_connection - allocate resources, + * starts rx & tx (if relevant) queues pair. Provides + * connecion handler as output parameter. + * + * @param p_hwfn + * @param p_params Contain various configuration properties + * @param rx_num_desc + * @param tx_num_desc + * + * @param p_connection_handle Output container for LL2 connection's handle + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_acquire_connection(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_params, + u16 rx_num_desc, + u16 tx_num_desc, + u8 *p_connection_handle); + +/** + * @brief qed_ll2_establish_connection - start previously + * allocated LL2 queues pair + * + * @param p_hwfn + * @param p_ptt + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_establish_connection(struct qed_hwfn *p_hwfn, u8 connection_handle); + +/** + * @brief qed_ll2_post_rx_buffers - submit buffers to LL2 Rx queue. + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * @param addr rx (physical address) buffers to submit + * @param cookie + * @param notify_fw produce corresponding Rx BD immediately + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_post_rx_buffer(struct qed_hwfn *p_hwfn, + u8 connection_handle, + dma_addr_t addr, + u16 buf_len, void *cookie, u8 notify_fw); + +/** + * @brief qed_ll2_prepare_tx_packet - request for start Tx BD + * to prepare Tx packet submission to FW. + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * @param num_of_bds a number of requested BD equals a number of + * fragments in Tx packet + * @param vlan VLAN to insert to packet (if insertion set) + * @param bd_flags + * @param l4_hdr_offset_w L4 Header Offset from start of packet + * (in words). This is needed if both l4_csum + * and ipv6_ext are set + * @param first_frag + * @param first_frag_len + * @param cookie + * + * @param notify_fw + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + u8 num_of_bds, + u16 vlan, + u8 bd_flags, + u16 l4_hdr_offset_w, + dma_addr_t first_frag, + u16 first_frag_len, void *cookie, u8 notify_fw); + +/** + * @brief qed_ll2_release_connection - releases resources + * allocated for LL2 connection + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + */ +void qed_ll2_release_connection(struct qed_hwfn *p_hwfn, u8 connection_handle); + +/** + * @brief qed_ll2_set_fragment_of_tx_packet - provides fragments to fill + * Tx BD of BDs requested by + * qed_ll2_prepare_tx_packet + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle + * obtained from + * qed_ll2_require_connection + * @param addr + * @param nbytes + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_set_fragment_of_tx_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + dma_addr_t addr, u16 nbytes); + +/** + * @brief qed_ll2_terminate_connection - stops Tx/Rx queues + * + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle + * obtained from + * qed_ll2_require_connection + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_terminate_connection(struct qed_hwfn *p_hwfn, u8 connection_handle); + +/** + * @brief qed_ll2_get_stats - get LL2 queue's statistics + * + * + * @param p_hwfn + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * @param p_stats + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_get_stats(struct qed_hwfn *p_hwfn, + u8 connection_handle, struct qed_ll2_stats *p_stats); + +/** + * @brief qed_ll2_alloc - Allocates LL2 connections set + * + * @param p_hwfn + * + * @return pointer to alocated qed_ll2_info or NULL + */ +struct qed_ll2_info *qed_ll2_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ll2_setup - Inits LL2 connections set + * + * @param p_hwfn + * @param p_ll2_connections + * + */ +void qed_ll2_setup(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_connections); + +/** + * @brief qed_ll2_free - Releases LL2 connections set + * + * @param p_hwfn + * @param p_ll2_connections + * + */ +void qed_ll2_free(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_connections); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index b730a632c383..48cdf62c025b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -22,11 +22,13 @@ #include #include #include +#include #include "qed.h" #include "qed_sriov.h" #include "qed_sp.h" #include "qed_dev_api.h" +#include "qed_ll2.h" #include "qed_mcp.h" #include "qed_hw.h" #include "qed_selftest.h" @@ -608,7 +610,16 @@ static int qed_nic_reset(struct qed_dev *cdev) static int qed_nic_setup(struct qed_dev *cdev) { - int rc; + int rc, i; + + /* Determine if interface is going to require LL2 */ + if (QED_LEADING_HWFN(cdev)->hw_info.personality != QED_PCI_ETH) { + for (i = 0; i < cdev->num_hwfns; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->using_ll2 = true; + } + } rc = qed_resc_alloc(cdev); if (rc) @@ -873,6 +884,12 @@ static int qed_slowpath_start(struct qed_dev *cdev, DP_INFO(cdev, "HW initialization and function start completed successfully\n"); + /* Allocate LL2 interface if needed */ + if (QED_LEADING_HWFN(cdev)->using_ll2) { + rc = qed_ll2_alloc_if(cdev); + if (rc) + goto err3; + } if (IS_PF(cdev)) { hwfn = QED_LEADING_HWFN(cdev); drv_version.version = (params->drv_major << 24) | @@ -893,6 +910,8 @@ static int qed_slowpath_start(struct qed_dev *cdev, return 0; +err3: + qed_hw_stop(cdev); err2: qed_hw_timers_stop_all(cdev); if (IS_PF(cdev)) @@ -915,6 +934,8 @@ static int qed_slowpath_stop(struct qed_dev *cdev) if (!cdev) return -ENODEV; + qed_ll2_dealloc_if(cdev); + if (IS_PF(cdev)) { qed_free_stream_mem(cdev); if (IS_QED_ETH_IF(cdev)) diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index 759cb04e02b0..e75738d21783 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -208,6 +208,26 @@ 0x50196cUL #define NIG_REG_LLH_CLS_TYPE_DUALMODE \ 0x501964UL +#define NIG_REG_LLH_FUNC_FILTER_VALUE \ + 0x501a00UL +#define NIG_REG_LLH_FUNC_FILTER_VALUE_SIZE \ + 32 +#define NIG_REG_LLH_FUNC_FILTER_EN \ + 0x501a80UL +#define NIG_REG_LLH_FUNC_FILTER_EN_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_MODE \ + 0x501ac0UL +#define NIG_REG_LLH_FUNC_FILTER_MODE_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE \ + 0x501b00UL +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL \ + 0x501b40UL +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_SIZE \ + 16 #define NCSI_REG_CONFIG \ 0x040200UL #define PBF_REG_INIT \ @@ -264,6 +284,8 @@ 0x1f0a1cUL #define PRS_REG_ROCE_DEST_QP_MAX_PF \ 0x1f0430UL +#define PRS_REG_USE_LIGHT_L2 \ + 0x1f096cUL #define PSDM_REG_ENABLE_IN1 \ 0xfa0004UL #define PSEM_REG_ENABLE_IN \ diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h index a548504c3420..a3c539f1c2ac 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -61,6 +61,10 @@ union ramrod_data { struct vport_start_ramrod_data vport_start; struct vport_stop_ramrod_data vport_stop; struct vport_update_ramrod_data vport_update; + struct core_rx_start_ramrod_data core_rx_queue_start; + struct core_rx_stop_ramrod_data core_rx_queue_stop; + struct core_tx_start_ramrod_data core_tx_queue_start; + struct core_tx_stop_ramrod_data core_tx_queue_stop; struct vport_filter_update_ramrod_data vport_filter_update; struct rdma_init_func_ramrod_data rdma_init_func; diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index e4546abcea08..c2d74e8785cf 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -627,6 +627,7 @@ enum DP_MODULE { QED_MSG_SP = 0x100000, QED_MSG_STORAGE = 0x200000, QED_MSG_CXT = 0x800000, + QED_MSG_LL2 = 0x1000000, QED_MSG_ILT = 0x2000000, QED_MSG_ROCE = 0x4000000, QED_MSG_DEBUG = 0x8000000, diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h new file mode 100644 index 000000000000..fd75c265dba3 --- /dev/null +++ b/include/linux/qed/qed_ll2_if.h @@ -0,0 +1,139 @@ +/* QLogic qed NIC Driver + * + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QED_LL2_IF_H +#define _QED_LL2_IF_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct qed_ll2_stats { + u64 gsi_invalid_hdr; + u64 gsi_invalid_pkt_length; + u64 gsi_unsupported_pkt_typ; + u64 gsi_crcchksm_error; + + u64 packet_too_big_discard; + u64 no_buff_discard; + + u64 rcv_ucast_bytes; + u64 rcv_mcast_bytes; + u64 rcv_bcast_bytes; + u64 rcv_ucast_pkts; + u64 rcv_mcast_pkts; + u64 rcv_bcast_pkts; + + u64 sent_ucast_bytes; + u64 sent_mcast_bytes; + u64 sent_bcast_bytes; + u64 sent_ucast_pkts; + u64 sent_mcast_pkts; + u64 sent_bcast_pkts; +}; + +#define QED_LL2_UNUSED_HANDLE (0xff) + +struct qed_ll2_cb_ops { + int (*rx_cb)(void *, struct sk_buff *, u32, u32); + int (*tx_cb)(void *, struct sk_buff *, bool); +}; + +struct qed_ll2_params { + u16 mtu; + bool drop_ttl0_packets; + bool rx_vlan_stripping; + u8 tx_tc; + bool frags_mapped; + u8 ll2_mac_address[ETH_ALEN]; +}; + +struct qed_ll2_ops { +/** + * @brief start - initializes ll2 + * + * @param cdev + * @param params - protocol driver configuration for the ll2. + * + * @return 0 on success, otherwise error value. + */ + int (*start)(struct qed_dev *cdev, struct qed_ll2_params *params); + +/** + * @brief stop - stops the ll2 + * + * @param cdev + * + * @return 0 on success, otherwise error value. + */ + int (*stop)(struct qed_dev *cdev); + +/** + * @brief start_xmit - transmits an skb over the ll2 interface + * + * @param cdev + * @param skb + * + * @return 0 on success, otherwise error value. + */ + int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb); + +/** + * @brief register_cb_ops - protocol driver register the callback for Rx/Tx + * packets. Should be called before `start'. + * + * @param cdev + * @param cookie - to be passed to the callback functions. + * @param ops - the callback functions to register for Rx / Tx. + * + * @return 0 on success, otherwise error value. + */ + void (*register_cb_ops)(struct qed_dev *cdev, + const struct qed_ll2_cb_ops *ops, + void *cookie); + +/** + * @brief get LL2 related statistics + * + * @param cdev + * @param stats - pointer to struct that would be filled with stats + * + * @return 0 on success, error otherwise. + */ + int (*get_stats)(struct qed_dev *cdev, struct qed_ll2_stats *stats); +}; + +#ifdef CONFIG_QED_LL2 +int qed_ll2_alloc_if(struct qed_dev *); +void qed_ll2_dealloc_if(struct qed_dev *); +#else +static const struct qed_ll2_ops qed_ll2_ops_pass = { + .start = NULL, + .stop = NULL, + .start_xmit = NULL, + .register_cb_ops = NULL, + .get_stats = NULL, +}; + +static inline int qed_ll2_alloc_if(struct qed_dev *cdev) +{ + return 0; +} + +static inline void qed_ll2_dealloc_if(struct qed_dev *cdev) +{ +} +#endif +#endif -- cgit v1.2.3 From cee9fbd8e2e9e713cd8bf227c6492fd8854de74b Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:56 +0300 Subject: qede: Add qedr framework Adds a skeletal implementation of the qede RoCE driver - The qedr has some dependencies of the state of the underlying base interface. This adds some logic required with mutual registrations and the ability to pass updates on 'intresting' events. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/Kconfig | 18 +- drivers/net/ethernet/qlogic/qede/Makefile | 1 + drivers/net/ethernet/qlogic/qede/qede.h | 9 + drivers/net/ethernet/qlogic/qede/qede_main.c | 35 ++- drivers/net/ethernet/qlogic/qede/qede_roce.c | 314 +++++++++++++++++++++++++++ include/linux/qed/qed_if.h | 3 +- include/linux/qed/qede_roce.h | 88 ++++++++ 7 files changed, 451 insertions(+), 17 deletions(-) create mode 100644 drivers/net/ethernet/qlogic/qede/qede_roce.c create mode 100644 include/linux/qed/qede_roce.h (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 9eb3b1914cf5..0df1391f9663 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -89,12 +89,7 @@ config QED This enables the support for ... config QED_LL2 - bool "Qlogic QED Light L2 interface" - default n - depends on QED - ---help--- - This enables support for Light L2 interface which is required - by all qed protocol drivers other than qede. + bool config QED_SRIOV bool "QLogic QED 25/40/100Gb SR-IOV support" @@ -112,4 +107,15 @@ config QEDE ---help--- This enables the support for ... +config INFINIBAND_QEDR + tristate "QLogic qede RoCE sources [debug]" + depends on QEDE && 64BIT + select QED_LL2 + default n + ---help--- + This provides a temporary node that allows the compilation + and logical testing of the InfiniBand over Ethernet support + for QLogic QED. This would be replaced by the 'real' option + once the QEDR driver is added [+relocated]. + endif # NET_VENDOR_QLOGIC diff --git a/drivers/net/ethernet/qlogic/qede/Makefile b/drivers/net/ethernet/qlogic/qede/Makefile index 74a49850d74d..28dc58919c85 100644 --- a/drivers/net/ethernet/qlogic/qede/Makefile +++ b/drivers/net/ethernet/qlogic/qede/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_QEDE) := qede.o qede-y := qede_main.o qede_ethtool.o qede-$(CONFIG_DCB) += qede_dcbnl.o +qede-$(CONFIG_INFINIBAND_QEDR) += qede_roce.o diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index e01adce4a966..28c0e9f42c9e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -106,6 +106,13 @@ struct qede_vlan { bool configured; }; +struct qede_rdma_dev { + struct qedr_dev *qedr_dev; + struct list_head entry; + struct list_head roce_event_list; + struct workqueue_struct *roce_wq; +}; + struct qede_dev { struct qed_dev *cdev; struct net_device *ndev; @@ -185,6 +192,8 @@ struct qede_dev { unsigned long sp_flags; u16 vxlan_dst_port; u16 geneve_dst_port; + + struct qede_rdma_dev rdma_info; }; enum QEDE_STATE { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 0e198fe89d1a..343038ca047d 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -36,7 +36,7 @@ #include #include #include - +#include #include "qede.h" static char version[] = @@ -193,8 +193,7 @@ static int qede_netdev_event(struct notifier_block *this, unsigned long event, struct ethtool_drvinfo drvinfo; struct qede_dev *edev; - /* Currently only support name change */ - if (event != NETDEV_CHANGENAME) + if (event != NETDEV_CHANGENAME && event != NETDEV_CHANGEADDR) goto done; /* Check whether this is a qede device */ @@ -207,11 +206,18 @@ static int qede_netdev_event(struct notifier_block *this, unsigned long event, goto done; edev = netdev_priv(ndev); - /* Notify qed of the name change */ - if (!edev->ops || !edev->ops->common) - goto done; - edev->ops->common->set_id(edev->cdev, edev->ndev->name, - "qede"); + switch (event) { + case NETDEV_CHANGENAME: + /* Notify qed of the name change */ + if (!edev->ops || !edev->ops->common) + goto done; + edev->ops->common->set_id(edev->cdev, edev->ndev->name, "qede"); + break; + case NETDEV_CHANGEADDR: + edev = netdev_priv(ndev); + qede_roce_event_changeaddr(edev); + break; + } done: return NOTIFY_DONE; @@ -2545,10 +2551,14 @@ static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level, qede_init_ndev(edev); + rc = qede_roce_dev_add(edev); + if (rc) + goto err3; + rc = register_netdev(edev->ndev); if (rc) { DP_NOTICE(edev, "Cannot register net-device\n"); - goto err3; + goto err4; } edev->ops->common->set_id(cdev, edev->ndev->name, DRV_MODULE_VERSION); @@ -2568,6 +2578,8 @@ static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level, return 0; +err4: + qede_roce_dev_remove(edev); err3: free_netdev(edev->ndev); err2: @@ -2614,8 +2626,11 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) DP_INFO(edev, "Starting qede_remove\n"); cancel_delayed_work_sync(&edev->sp_task); + unregister_netdev(ndev); + qede_roce_dev_remove(edev); + edev->ops->common->set_power_state(cdev, PCI_D0); pci_set_drvdata(pdev, NULL); @@ -3512,6 +3527,7 @@ static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode) DP_INFO(edev, "Starting qede unload\n"); + qede_roce_dev_event_close(edev); mutex_lock(&edev->qede_lock); edev->state = QEDE_STATE_CLOSED; @@ -3612,6 +3628,7 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode) /* Query whether link is already-up */ memset(&link_output, 0, sizeof(link_output)); edev->ops->common->get_link(edev->cdev, &link_output); + qede_roce_dev_event_open(edev); qede_link_update(edev, &link_output); DP_INFO(edev, "Ending successfully qede load\n"); diff --git a/drivers/net/ethernet/qlogic/qede/qede_roce.c b/drivers/net/ethernet/qlogic/qede/qede_roce.c new file mode 100644 index 000000000000..9867f960b063 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_roce.c @@ -0,0 +1,314 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include "qede.h" + +static struct qedr_driver *qedr_drv; +static LIST_HEAD(qedr_dev_list); +static DEFINE_MUTEX(qedr_dev_list_lock); + +bool qede_roce_supported(struct qede_dev *dev) +{ + return dev->dev_info.common.rdma_supported; +} + +static void _qede_roce_dev_add(struct qede_dev *edev) +{ + if (!qedr_drv) + return; + + edev->rdma_info.qedr_dev = qedr_drv->add(edev->cdev, edev->pdev, + edev->ndev); +} + +static int qede_roce_create_wq(struct qede_dev *edev) +{ + INIT_LIST_HEAD(&edev->rdma_info.roce_event_list); + edev->rdma_info.roce_wq = create_singlethread_workqueue("roce_wq"); + if (!edev->rdma_info.roce_wq) { + DP_NOTICE(edev, "qedr: Could not create workqueue\n"); + return -ENOMEM; + } + + return 0; +} + +static void qede_roce_cleanup_event(struct qede_dev *edev) +{ + struct list_head *head = &edev->rdma_info.roce_event_list; + struct qede_roce_event_work *event_node; + + flush_workqueue(edev->rdma_info.roce_wq); + while (!list_empty(head)) { + event_node = list_entry(head->next, struct qede_roce_event_work, + list); + cancel_work_sync(&event_node->work); + list_del(&event_node->list); + kfree(event_node); + } +} + +static void qede_roce_destroy_wq(struct qede_dev *edev) +{ + qede_roce_cleanup_event(edev); + destroy_workqueue(edev->rdma_info.roce_wq); +} + +int qede_roce_dev_add(struct qede_dev *edev) +{ + int rc = 0; + + if (qede_roce_supported(edev)) { + rc = qede_roce_create_wq(edev); + if (rc) + return rc; + + INIT_LIST_HEAD(&edev->rdma_info.entry); + mutex_lock(&qedr_dev_list_lock); + list_add_tail(&edev->rdma_info.entry, &qedr_dev_list); + _qede_roce_dev_add(edev); + mutex_unlock(&qedr_dev_list_lock); + } + + return rc; +} + +static void _qede_roce_dev_remove(struct qede_dev *edev) +{ + if (qedr_drv && qedr_drv->remove && edev->rdma_info.qedr_dev) + qedr_drv->remove(edev->rdma_info.qedr_dev); + edev->rdma_info.qedr_dev = NULL; +} + +void qede_roce_dev_remove(struct qede_dev *edev) +{ + if (!qede_roce_supported(edev)) + return; + + qede_roce_destroy_wq(edev); + mutex_lock(&qedr_dev_list_lock); + _qede_roce_dev_remove(edev); + list_del(&edev->rdma_info.entry); + mutex_unlock(&qedr_dev_list_lock); +} + +static void _qede_roce_dev_open(struct qede_dev *edev) +{ + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_UP); +} + +static void qede_roce_dev_open(struct qede_dev *edev) +{ + if (!qede_roce_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + _qede_roce_dev_open(edev); + mutex_unlock(&qedr_dev_list_lock); +} + +static void _qede_roce_dev_close(struct qede_dev *edev) +{ + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_DOWN); +} + +static void qede_roce_dev_close(struct qede_dev *edev) +{ + if (!qede_roce_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + _qede_roce_dev_close(edev); + mutex_unlock(&qedr_dev_list_lock); +} + +static void qede_roce_dev_shutdown(struct qede_dev *edev) +{ + if (!qede_roce_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_CLOSE); + mutex_unlock(&qedr_dev_list_lock); +} + +int qede_roce_register_driver(struct qedr_driver *drv) +{ + struct qede_dev *edev; + u8 qedr_counter = 0; + + mutex_lock(&qedr_dev_list_lock); + if (qedr_drv) { + mutex_unlock(&qedr_dev_list_lock); + return -EINVAL; + } + qedr_drv = drv; + + list_for_each_entry(edev, &qedr_dev_list, rdma_info.entry) { + struct net_device *ndev; + + qedr_counter++; + _qede_roce_dev_add(edev); + ndev = edev->ndev; + if (netif_running(ndev) && netif_oper_up(ndev)) + _qede_roce_dev_open(edev); + } + mutex_unlock(&qedr_dev_list_lock); + + DP_INFO(edev, "qedr: discovered and registered %d RoCE funcs\n", + qedr_counter); + + return 0; +} +EXPORT_SYMBOL(qede_roce_register_driver); + +void qede_roce_unregister_driver(struct qedr_driver *drv) +{ + struct qede_dev *edev; + + mutex_lock(&qedr_dev_list_lock); + list_for_each_entry(edev, &qedr_dev_list, rdma_info.entry) { + if (edev->rdma_info.qedr_dev) + _qede_roce_dev_remove(edev); + } + qedr_drv = NULL; + mutex_unlock(&qedr_dev_list_lock); +} +EXPORT_SYMBOL(qede_roce_unregister_driver); + +static void qede_roce_changeaddr(struct qede_dev *edev) +{ + if (!qede_roce_supported(edev)) + return; + + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_CHANGE_ADDR); +} + +struct qede_roce_event_work *qede_roce_get_free_event_node(struct qede_dev + *edev) +{ + struct qede_roce_event_work *event_node = NULL; + struct list_head *list_node = NULL; + bool found = false; + + list_for_each(list_node, &edev->rdma_info.roce_event_list) { + event_node = list_entry(list_node, struct qede_roce_event_work, + list); + if (!work_pending(&event_node->work)) { + found = true; + break; + } + } + + if (!found) { + event_node = kzalloc(sizeof(*event_node), GFP_KERNEL); + if (!event_node) { + DP_NOTICE(edev, + "qedr: Could not allocate memory for roce work\n"); + return NULL; + } + list_add_tail(&event_node->list, + &edev->rdma_info.roce_event_list); + } + + return event_node; +} + +static void qede_roce_handle_event(struct work_struct *work) +{ + struct qede_roce_event_work *event_node; + enum qede_roce_event event; + struct qede_dev *edev; + + event_node = container_of(work, struct qede_roce_event_work, work); + event = event_node->event; + edev = event_node->ptr; + + switch (event) { + case QEDE_UP: + qede_roce_dev_open(edev); + break; + case QEDE_DOWN: + qede_roce_dev_close(edev); + break; + case QEDE_CLOSE: + qede_roce_dev_shutdown(edev); + break; + case QEDE_CHANGE_ADDR: + qede_roce_changeaddr(edev); + break; + default: + DP_NOTICE(edev, "Invalid roce event %d", event); + } +} + +static void qede_roce_add_event(struct qede_dev *edev, + enum qede_roce_event event) +{ + struct qede_roce_event_work *event_node; + + if (!edev->rdma_info.qedr_dev) + return; + + event_node = qede_roce_get_free_event_node(edev); + if (!event_node) + return; + + event_node->event = event; + event_node->ptr = edev; + + INIT_WORK(&event_node->work, qede_roce_handle_event); + queue_work(edev->rdma_info.roce_wq, &event_node->work); +} + +void qede_roce_dev_event_open(struct qede_dev *edev) +{ + qede_roce_add_event(edev, QEDE_UP); +} + +void qede_roce_dev_event_close(struct qede_dev *edev) +{ + qede_roce_add_event(edev, QEDE_DOWN); +} + +void qede_roce_event_changeaddr(struct qede_dev *edev) +{ + qede_roce_add_event(edev, QEDE_CHANGE_ADDR); +} diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index c2d74e8785cf..e313742b571d 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -260,11 +260,10 @@ struct qed_dev_info { /* MFW version */ u32 mfw_rev; - bool rdma_supported; - u32 flash_size; u8 mf_mode; bool tx_switching; + bool rdma_supported; }; enum qed_sb_type { diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h new file mode 100644 index 000000000000..99fbe6d55acb --- /dev/null +++ b/include/linux/qed/qede_roce.h @@ -0,0 +1,88 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef QEDE_ROCE_H +#define QEDE_ROCE_H + +struct qedr_dev; +struct qed_dev; +struct qede_dev; + +enum qede_roce_event { + QEDE_UP, + QEDE_DOWN, + QEDE_CHANGE_ADDR, + QEDE_CLOSE +}; + +struct qede_roce_event_work { + struct list_head list; + struct work_struct work; + void *ptr; + enum qede_roce_event event; +}; + +struct qedr_driver { + unsigned char name[32]; + + struct qedr_dev* (*add)(struct qed_dev *, struct pci_dev *, + struct net_device *); + + void (*remove)(struct qedr_dev *); + void (*notify)(struct qedr_dev *, enum qede_roce_event); +}; + +/* APIs for RoCE driver to register callback handlers, + * which will be invoked when device is added, removed, ifup, ifdown + */ +int qede_roce_register_driver(struct qedr_driver *drv); +void qede_roce_unregister_driver(struct qedr_driver *drv); + +bool qede_roce_supported(struct qede_dev *dev); + +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +int qede_roce_dev_add(struct qede_dev *dev); +void qede_roce_dev_event_open(struct qede_dev *dev); +void qede_roce_dev_event_close(struct qede_dev *dev); +void qede_roce_dev_remove(struct qede_dev *dev); +void qede_roce_event_changeaddr(struct qede_dev *qedr); +#else +static inline int qede_roce_dev_add(struct qede_dev *dev) +{ + return 0; +} + +static inline void qede_roce_dev_event_open(struct qede_dev *dev) {} +static inline void qede_roce_dev_event_close(struct qede_dev *dev) {} +static inline void qede_roce_dev_remove(struct qede_dev *dev) {} +static inline void qede_roce_event_changeaddr(struct qede_dev *qedr) {} +#endif +#endif -- cgit v1.2.3 From 51ff17251c9c2c2e71974149d22bc73ea09c27cc Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:57 +0300 Subject: qed: Add support for RoCE hw init This adds the backbone required for the various HW initalizations which are necessary for the qedr driver - FW notification, resource initializations, etc. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/Makefile | 1 + drivers/net/ethernet/qlogic/qed/qed.h | 33 +- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 6 + drivers/net/ethernet/qlogic/qed/qed_cxt.h | 6 + drivers/net/ethernet/qlogic/qed/qed_dev.c | 154 +++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 44 +- drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 8 +- drivers/net/ethernet/qlogic/qed/qed_roce.c | 886 +++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_roce.h | 123 ++++ drivers/net/ethernet/qlogic/qed/qed_sp.h | 1 + drivers/net/ethernet/qlogic/qed/qed_spq.c | 8 + drivers/net/ethernet/qlogic/qed/qed_sriov.c | 4 +- drivers/net/ethernet/qlogic/qed/qed_vf.c | 2 +- include/linux/qed/common_hsi.h | 1 + include/linux/qed/qed_if.h | 5 +- include/linux/qed/qed_roce_if.h | 345 ++++++++++ include/linux/qed/rdma_common.h | 1 + 17 files changed, 1620 insertions(+), 8 deletions(-) create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.h create mode 100644 include/linux/qed/qed_roce_if.h (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index e067098f10a9..cda0af7fbc20 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -5,3 +5,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o qed-$(CONFIG_QED_LL2) += qed_ll2.o +qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 91b571a3670b..c5a098a2ca2c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -35,6 +35,9 @@ extern const struct qed_common_ops qed_common_ops_pass; #define QED_WFQ_UNIT 100 +#define QED_WID_SIZE (1024) +#define QED_PF_DEMS_SIZE (4) + /* cau states */ enum qed_coalescing_mode { QED_COAL_MODE_DISABLE, @@ -48,6 +51,14 @@ enum qed_mcp_protocol_type; /* helpers */ static inline u32 qed_db_addr(u32 cid, u32 DEMS) +{ + u32 db_addr = FIELD_VALUE(DB_LEGACY_ADDR_DEMS, DEMS) | + (cid * QED_PF_DEMS_SIZE); + + return db_addr; +} + +static inline u32 qed_db_addr_vf(u32 cid, u32 DEMS) { u32 db_addr = FIELD_VALUE(DB_LEGACY_ADDR_DEMS, DEMS) | FIELD_VALUE(DB_LEGACY_ADDR_ICID, cid); @@ -152,14 +163,17 @@ enum QED_RESOURCES { QED_RL, QED_MAC, QED_VLAN, + QED_RDMA_CNQ_RAM, QED_ILT, QED_LL2_QUEUE, + QED_RDMA_STATS_QUEUE, QED_MAX_RESC, }; enum QED_FEATURE { QED_PF_L2_QUE, QED_VF, + QED_RDMA_CNQ, QED_MAX_FEATURES, }; @@ -364,6 +378,7 @@ struct qed_hwfn { /* Protocol related */ bool using_ll2; struct qed_ll2_info *p_ll2_info; + struct qed_rdma_info *p_rdma_info; struct qed_pf_params pf_params; bool b_rdma_enabled_in_prs; @@ -402,6 +417,17 @@ struct qed_hwfn { struct dbg_tools_data dbg_info; + /* PWM region specific data */ + u32 dpi_size; + u32 dpi_count; + + /* This is used to calculate the doorbell address */ + u32 dpi_start_offset; + + /* If one of the following is set then EDPM shouldn't be used */ + u8 dcbx_no_edpm; + u8 db_bar_no_edpm; + struct qed_simd_fp_handler simd_proto_handler[64]; #ifdef CONFIG_QED_SRIOV @@ -435,6 +461,8 @@ struct qed_int_params { bool fp_initialized; u8 fp_msix_base; u8 fp_msix_cnt; + u8 rdma_msix_base; + u8 rdma_msix_cnt; }; struct qed_dbg_feature { @@ -541,7 +569,6 @@ struct qed_dev { bool b_is_vf; u32 drv_type; - struct qed_eth_stats *reset_stats; struct qed_fw_data *fw_data; @@ -574,6 +601,10 @@ struct qed_dev { #endif const struct firmware *firmware; + + u32 rdma_max_sge; + u32 rdma_max_inline; + u32 rdma_max_srq_sge; }; #define NUM_OF_VFS(dev) MAX_NUM_VFS_BB diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index d9bea2a9c9f7..82370a1a59ad 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -48,7 +48,13 @@ #define TM_ELEM_SIZE 4 /* ILT constants */ +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +/* For RoCE we configure to 64K to cover for RoCE max tasks 256K purpose. */ +#define ILT_DEFAULT_HW_P_SIZE 4 +#else #define ILT_DEFAULT_HW_P_SIZE 3 +#endif + #define ILT_PAGE_IN_BYTES(hw_p_size) (1U << ((hw_p_size) + 12)) #define ILT_CFG_REG(cli, reg) PSWRQ2_REG_ ## cli ## _ ## reg ## _RT_OFFSET diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h index c6f6f2e8192d..d00ad055802b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h @@ -170,6 +170,12 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); */ void qed_cxt_release_cid(struct qed_hwfn *p_hwfn, u32 cid); +int qed_cxt_dynamic_ilt_alloc(struct qed_hwfn *p_hwfn, + enum qed_cxt_elem_type elem_type, u32 iid); +u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type); +u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn, + enum protocol_type type); #define QED_CTX_WORKING_MEM 0 #define QED_CTX_FL_MEM 1 diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 9a8e153df841..754f6a908858 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -35,9 +35,13 @@ #include "qed_sp.h" #include "qed_sriov.h" #include "qed_vf.h" +#include "qed_roce.h" static DEFINE_SPINLOCK(qm_lock); +#define QED_MIN_DPIS (4) +#define QED_MIN_PWM_REGION (QED_WID_SIZE * QED_MIN_DPIS) + /* API common to all protocols */ enum BAR_ID { BAR_ID_0, /* used for GRC */ @@ -787,6 +791,136 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn, return rc; } +static int +qed_hw_init_dpi_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 pwm_region_size, u32 n_cpus) +{ + u32 dpi_page_size_1, dpi_page_size_2, dpi_page_size; + u32 dpi_bit_shift, dpi_count; + u32 min_dpis; + + /* Calculate DPI size */ + dpi_page_size_1 = QED_WID_SIZE * n_cpus; + dpi_page_size_2 = max_t(u32, QED_WID_SIZE, PAGE_SIZE); + dpi_page_size = max_t(u32, dpi_page_size_1, dpi_page_size_2); + dpi_page_size = roundup_pow_of_two(dpi_page_size); + dpi_bit_shift = ilog2(dpi_page_size / 4096); + + dpi_count = pwm_region_size / dpi_page_size; + + min_dpis = p_hwfn->pf_params.rdma_pf_params.min_dpis; + min_dpis = max_t(u32, QED_MIN_DPIS, min_dpis); + + p_hwfn->dpi_size = dpi_page_size; + p_hwfn->dpi_count = dpi_count; + + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DPI_BIT_SHIFT, dpi_bit_shift); + + if (dpi_count < min_dpis) + return -EINVAL; + + return 0; +} + +enum QED_ROCE_EDPM_MODE { + QED_ROCE_EDPM_MODE_ENABLE = 0, + QED_ROCE_EDPM_MODE_FORCE_ON = 1, + QED_ROCE_EDPM_MODE_DISABLE = 2, +}; + +static int +qed_hw_init_pf_doorbell_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 pwm_regsize, norm_regsize; + u32 non_pwm_conn, min_addr_reg1; + u32 db_bar_size, n_cpus; + u32 roce_edpm_mode; + u32 pf_dems_shift; + int rc = 0; + u8 cond; + + db_bar_size = qed_hw_bar_size(p_hwfn, BAR_ID_1); + if (p_hwfn->cdev->num_hwfns > 1) + db_bar_size /= 2; + + /* Calculate doorbell regions */ + non_pwm_conn = qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_CORE) + + qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_CORE, + NULL) + + qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, + NULL); + norm_regsize = roundup(QED_PF_DEMS_SIZE * non_pwm_conn, 4096); + min_addr_reg1 = norm_regsize / 4096; + pwm_regsize = db_bar_size - norm_regsize; + + /* Check that the normal and PWM sizes are valid */ + if (db_bar_size < norm_regsize) { + DP_ERR(p_hwfn->cdev, + "Doorbell BAR size 0x%x is too small (normal region is 0x%0x )\n", + db_bar_size, norm_regsize); + return -EINVAL; + } + + if (pwm_regsize < QED_MIN_PWM_REGION) { + DP_ERR(p_hwfn->cdev, + "PWM region size 0x%0x is too small. Should be at least 0x%0x (Doorbell BAR size is 0x%x and normal region size is 0x%0x)\n", + pwm_regsize, + QED_MIN_PWM_REGION, db_bar_size, norm_regsize); + return -EINVAL; + } + + /* Calculate number of DPIs */ + roce_edpm_mode = p_hwfn->pf_params.rdma_pf_params.roce_edpm_mode; + if ((roce_edpm_mode == QED_ROCE_EDPM_MODE_ENABLE) || + ((roce_edpm_mode == QED_ROCE_EDPM_MODE_FORCE_ON))) { + /* Either EDPM is mandatory, or we are attempting to allocate a + * WID per CPU. + */ + n_cpus = num_active_cpus(); + rc = qed_hw_init_dpi_size(p_hwfn, p_ptt, pwm_regsize, n_cpus); + } + + cond = (rc && (roce_edpm_mode == QED_ROCE_EDPM_MODE_ENABLE)) || + (roce_edpm_mode == QED_ROCE_EDPM_MODE_DISABLE); + if (cond || p_hwfn->dcbx_no_edpm) { + /* Either EDPM is disabled from user configuration, or it is + * disabled via DCBx, or it is not mandatory and we failed to + * allocated a WID per CPU. + */ + n_cpus = 1; + rc = qed_hw_init_dpi_size(p_hwfn, p_ptt, pwm_regsize, n_cpus); + + if (cond) + qed_rdma_dpm_bar(p_hwfn, p_ptt); + } + + DP_INFO(p_hwfn, + "doorbell bar: normal_region_size=%d, pwm_region_size=%d, dpi_size=%d, dpi_count=%d, roce_edpm=%s\n", + norm_regsize, + pwm_regsize, + p_hwfn->dpi_size, + p_hwfn->dpi_count, + ((p_hwfn->dcbx_no_edpm) || (p_hwfn->db_bar_no_edpm)) ? + "disabled" : "enabled"); + + if (rc) { + DP_ERR(p_hwfn, + "Failed to allocate enough DPIs. Allocated %d but the current minimum is %d.\n", + p_hwfn->dpi_count, + p_hwfn->pf_params.rdma_pf_params.min_dpis); + return -EINVAL; + } + + p_hwfn->dpi_start_offset = norm_regsize; + + /* DEMS size is configured log2 of DWORDs, hence the division by 4 */ + pf_dems_shift = ilog2(QED_PF_DEMS_SIZE / 4); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_ICID_BIT_SHIFT_NORM, pf_dems_shift); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_MIN_ADDR_REG1, min_addr_reg1); + + return 0; +} + static int qed_hw_init_port(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, int hw_mode) { @@ -860,6 +994,10 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn, /* Pure runtime initializations - directly to the HW */ qed_int_igu_init_pure_rt(p_hwfn, p_ptt, true, true); + rc = qed_hw_init_pf_doorbell_bar(p_hwfn, p_ptt); + if (rc) + return rc; + if (b_hw_start) { /* enable interrupts */ qed_int_igu_enable(p_hwfn, p_ptt, int_mode); @@ -1284,6 +1422,19 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn) u32 *feat_num = p_hwfn->hw_info.feat_num; int num_features = 1; +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) + /* Roce CNQ each requires: 1 status block + 1 CNQ. We divide the + * status blocks equally between L2 / RoCE but with consideration as + * to how many l2 queues / cnqs we have + */ + if (p_hwfn->hw_info.personality == QED_PCI_ETH_ROCE) { + num_features++; + + feat_num[QED_RDMA_CNQ] = + min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, + RESC_NUM(p_hwfn, QED_RDMA_CNQ_RAM)); + } +#endif feat_num[QED_PF_L2_QUE] = min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, RESC_NUM(p_hwfn, QED_L2_QUEUE)); @@ -1325,6 +1476,9 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) num_funcs; resc_num[QED_ILT] = PXP_NUM_ILT_RECORDS_BB / num_funcs; resc_num[QED_LL2_QUEUE] = MAX_NUM_LL2_RX_QUEUES / num_funcs; + resc_num[QED_RDMA_CNQ_RAM] = NUM_OF_CMDQS_CQS / num_funcs; + resc_num[QED_RDMA_STATS_QUEUE] = RDMA_NUM_STATISTIC_COUNTERS_BB / + num_funcs; for (i = 0; i < QED_MAX_RESC; i++) resc_start[i] = resc_num[i] * enabled_func_idx; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 48cdf62c025b..4ee3151e80c2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -33,6 +33,11 @@ #include "qed_hw.h" #include "qed_selftest.h" +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +#define QED_ROCE_QPS (8192) +#define QED_ROCE_DPIS (8) +#endif + static char version[] = "QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n"; @@ -206,8 +211,8 @@ int qed_fill_dev_info(struct qed_dev *cdev, dev_info->pci_mem_start = cdev->pci_params.mem_start; dev_info->pci_mem_end = cdev->pci_params.mem_end; dev_info->pci_irq = cdev->pci_params.irq; - dev_info->rdma_supported = - (cdev->hwfns[0].hw_info.personality == QED_PCI_ETH_ROCE); + dev_info->rdma_supported = (cdev->hwfns[0].hw_info.personality == + QED_PCI_ETH_ROCE); dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]); ether_addr_copy(dev_info->hw_mac, cdev->hwfns[0].hw_info.hw_mac_addr); @@ -677,6 +682,9 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, enum qed_int_mode int_mode) { struct qed_sb_cnt_info sb_cnt_info; +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) + int num_l2_queues; +#endif int rc; int i; @@ -707,6 +715,31 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors - cdev->num_hwfns; +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) + num_l2_queues = 0; + for_each_hwfn(cdev, i) + num_l2_queues += FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE); + + DP_VERBOSE(cdev, QED_MSG_RDMA, + "cdev->int_params.fp_msix_cnt=%d num_l2_queues=%d\n", + cdev->int_params.fp_msix_cnt, num_l2_queues); + + if (cdev->int_params.fp_msix_cnt > num_l2_queues) { + cdev->int_params.rdma_msix_cnt = + (cdev->int_params.fp_msix_cnt - num_l2_queues) + / cdev->num_hwfns; + cdev->int_params.rdma_msix_base = + cdev->int_params.fp_msix_base + num_l2_queues; + cdev->int_params.fp_msix_cnt = num_l2_queues; + } else { + cdev->int_params.rdma_msix_cnt = 0; + } + + DP_VERBOSE(cdev, QED_MSG_RDMA, "roce_msix_cnt=%d roce_msix_base=%d\n", + cdev->int_params.rdma_msix_cnt, + cdev->int_params.rdma_msix_base); +#endif + return 0; } @@ -810,6 +843,13 @@ static void qed_update_pf_params(struct qed_dev *cdev, { int i; +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) + params->rdma_pf_params.num_qps = QED_ROCE_QPS; + params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; + /* divide by 3 the MRs to avoid MF ILT overflow */ + params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; + params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; +#endif for (i = 0; i < cdev->num_hwfns; i++) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index e75738d21783..b414a0542177 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -1448,5 +1448,11 @@ 0x620000UL #define PHY_PCIE_REG_PHY1 \ 0x624000UL - +#define NIG_REG_ROCE_DUPLICATE_TO_HOST 0x5088f0UL +#define PRS_REG_LIGHT_L2_ETHERTYPE_EN 0x1f0968UL +#define NIG_REG_LLH_ENG_CLS_ENG_ID_TBL 0x501b90UL +#define DORQ_REG_PF_DPM_ENABLE 0x100510UL +#define DORQ_REG_PF_ICID_BIT_SHIFT_NORM 0x100448UL +#define DORQ_REG_PF_MIN_ADDR_REG1 0x100400UL +#define DORQ_REG_PF_DPI_BIT_SHIFT 0x100450UL #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c new file mode 100644 index 000000000000..4c53b857cc1c --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -0,0 +1,886 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_roce.h" + +void qed_async_roce_event(struct qed_hwfn *p_hwfn, + struct event_ring_entry *p_eqe) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + + p_rdma_info->events.affiliated_event(p_rdma_info->events.context, + p_eqe->opcode, &p_eqe->data); +} + +static int qed_rdma_bmap_alloc(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 max_count) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "max_count = %08x\n", max_count); + + bmap->max_count = max_count; + + bmap->bitmap = kzalloc(BITS_TO_LONGS(max_count) * sizeof(long), + GFP_KERNEL); + if (!bmap->bitmap) { + DP_NOTICE(p_hwfn, + "qed bmap alloc failed: cannot allocate memory (bitmap)\n"); + return -ENOMEM; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocated bitmap %p\n", + bmap->bitmap); + return 0; +} + +static int qed_rdma_bmap_alloc_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 *id_num) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "bmap = %p\n", bmap); + + *id_num = find_first_zero_bit(bmap->bitmap, bmap->max_count); + + if (*id_num >= bmap->max_count) { + DP_NOTICE(p_hwfn, "no id available max_count=%d\n", + bmap->max_count); + return -EINVAL; + } + + __set_bit(*id_num, bmap->bitmap); + + return 0; +} + +static void qed_bmap_release_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + bool b_acquired; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "id_num = %08x", id_num); + if (id_num >= bmap->max_count) + return; + + b_acquired = test_and_clear_bit(id_num, bmap->bitmap); + if (!b_acquired) { + DP_NOTICE(p_hwfn, "ID %d already released\n", id_num); + return; + } +} + +u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) +{ + /* First sb id for RoCE is after all the l2 sb */ + return FEAT_NUM((struct qed_hwfn *)p_hwfn, QED_PF_L2_QUE) + rel_sb_id; +} + +u32 qed_rdma_query_cau_timer_res(void *rdma_cxt) +{ + return QED_CAU_DEF_RX_TIMER_RES; +} + +static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_info *p_rdma_info; + u32 num_cons, num_tasks; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocating RDMA\n"); + + /* Allocate a struct with current pf rdma info */ + p_rdma_info = kzalloc(sizeof(*p_rdma_info), GFP_KERNEL); + if (!p_rdma_info) { + DP_NOTICE(p_hwfn, + "qed rdma alloc failed: cannot allocate memory (rdma info). rc = %d\n", + rc); + return rc; + } + + p_hwfn->p_rdma_info = p_rdma_info; + p_rdma_info->proto = PROTOCOLID_ROCE; + + num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto, 0); + + p_rdma_info->num_qps = num_cons / 2; + + num_tasks = qed_cxt_get_proto_tid_count(p_hwfn, PROTOCOLID_ROCE); + + /* Each MR uses a single task */ + p_rdma_info->num_mrs = num_tasks; + + /* Queue zone lines are shared between RoCE and L2 in such a way that + * they can be used by each without obstructing the other. + */ + p_rdma_info->queue_zone_base = (u16)FEAT_NUM(p_hwfn, QED_L2_QUEUE); + + /* Allocate a struct with device params and fill it */ + p_rdma_info->dev = kzalloc(sizeof(*p_rdma_info->dev), GFP_KERNEL); + if (!p_rdma_info->dev) { + DP_NOTICE(p_hwfn, + "qed rdma alloc failed: cannot allocate memory (rdma info dev). rc = %d\n", + rc); + goto free_rdma_info; + } + + /* Allocate a struct with port params and fill it */ + p_rdma_info->port = kzalloc(sizeof(*p_rdma_info->port), GFP_KERNEL); + if (!p_rdma_info->port) { + DP_NOTICE(p_hwfn, + "qed rdma alloc failed: cannot allocate memory (rdma info port). rc = %d\n", + rc); + goto free_rdma_dev; + } + + /* Allocate bit map for pd's */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->pd_map, RDMA_MAX_PDS); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate pd_map, rc = %d\n", + rc); + goto free_rdma_port; + } + + /* Allocate DPI bitmap */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->dpi_map, + p_hwfn->dpi_count); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate DPI bitmap, rc = %d\n", rc); + goto free_pd_map; + } + + /* Allocate bitmap for cq's. The maximum number of CQs is bounded to + * twice the number of QPs. + */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map, + p_rdma_info->num_qps * 2); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate cq bitmap, rc = %d\n", rc); + goto free_dpi_map; + } + + /* Allocate bitmap for toggle bit for cq icids + * We toggle the bit every time we create or resize cq for a given icid. + * The maximum number of CQs is bounded to twice the number of QPs. + */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->toggle_bits, + p_rdma_info->num_qps * 2); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate toogle bits, rc = %d\n", rc); + goto free_cq_map; + } + + /* Allocate bitmap for itids */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->tid_map, + p_rdma_info->num_mrs); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate itids bitmaps, rc = %d\n", rc); + goto free_toggle_map; + } + + /* Allocate bitmap for cids used for qps. */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cid_map, num_cons); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate cid bitmap, rc = %d\n", rc); + goto free_tid_map; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocation successful\n"); + return 0; + +free_tid_map: + kfree(p_rdma_info->tid_map.bitmap); +free_toggle_map: + kfree(p_rdma_info->toggle_bits.bitmap); +free_cq_map: + kfree(p_rdma_info->cq_map.bitmap); +free_dpi_map: + kfree(p_rdma_info->dpi_map.bitmap); +free_pd_map: + kfree(p_rdma_info->pd_map.bitmap); +free_rdma_port: + kfree(p_rdma_info->port); +free_rdma_dev: + kfree(p_rdma_info->dev); +free_rdma_info: + kfree(p_rdma_info); + + return rc; +} + +void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + + kfree(p_rdma_info->cid_map.bitmap); + kfree(p_rdma_info->tid_map.bitmap); + kfree(p_rdma_info->toggle_bits.bitmap); + kfree(p_rdma_info->cq_map.bitmap); + kfree(p_rdma_info->dpi_map.bitmap); + kfree(p_rdma_info->pd_map.bitmap); + + kfree(p_rdma_info->port); + kfree(p_rdma_info->dev); + + kfree(p_rdma_info); +} + +static void qed_rdma_free(struct qed_hwfn *p_hwfn) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Freeing RDMA\n"); + + qed_rdma_resc_free(p_hwfn); +} + +static void qed_rdma_get_guid(struct qed_hwfn *p_hwfn, u8 *guid) +{ + guid[0] = p_hwfn->hw_info.hw_mac_addr[0] ^ 2; + guid[1] = p_hwfn->hw_info.hw_mac_addr[1]; + guid[2] = p_hwfn->hw_info.hw_mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = p_hwfn->hw_info.hw_mac_addr[3]; + guid[6] = p_hwfn->hw_info.hw_mac_addr[4]; + guid[7] = p_hwfn->hw_info.hw_mac_addr[5]; +} + +static void qed_rdma_init_events(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_events *events; + + events = &p_hwfn->p_rdma_info->events; + + events->unaffiliated_event = params->events->unaffiliated_event; + events->affiliated_event = params->events->affiliated_event; + events->context = params->events->context; +} + +static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + struct qed_dev *cdev = p_hwfn->cdev; + u32 pci_status_control; + u32 num_qps; + + /* Vendor specific information */ + dev->vendor_id = cdev->vendor_id; + dev->vendor_part_id = cdev->device_id; + dev->hw_ver = 0; + dev->fw_ver = (FW_MAJOR_VERSION << 24) | (FW_MINOR_VERSION << 16) | + (FW_REVISION_VERSION << 8) | (FW_ENGINEERING_VERSION); + + qed_rdma_get_guid(p_hwfn, (u8 *)&dev->sys_image_guid); + dev->node_guid = dev->sys_image_guid; + + dev->max_sge = min_t(u32, RDMA_MAX_SGE_PER_SQ_WQE, + RDMA_MAX_SGE_PER_RQ_WQE); + + if (cdev->rdma_max_sge) + dev->max_sge = min_t(u32, cdev->rdma_max_sge, dev->max_sge); + + dev->max_inline = ROCE_REQ_MAX_INLINE_DATA_SIZE; + + dev->max_inline = (cdev->rdma_max_inline) ? + min_t(u32, cdev->rdma_max_inline, dev->max_inline) : + dev->max_inline; + + dev->max_wqe = QED_RDMA_MAX_WQE; + dev->max_cnq = (u8)FEAT_NUM(p_hwfn, QED_RDMA_CNQ); + + /* The number of QPs may be higher than QED_ROCE_MAX_QPS, because + * it is up-aligned to 16 and then to ILT page size within qed cxt. + * This is OK in terms of ILT but we don't want to configure the FW + * above its abilities + */ + num_qps = ROCE_MAX_QPS; + num_qps = min_t(u64, num_qps, p_hwfn->p_rdma_info->num_qps); + dev->max_qp = num_qps; + + /* CQs uses the same icids that QPs use hence they are limited by the + * number of icids. There are two icids per QP. + */ + dev->max_cq = num_qps * 2; + + /* The number of mrs is smaller by 1 since the first is reserved */ + dev->max_mr = p_hwfn->p_rdma_info->num_mrs - 1; + dev->max_mr_size = QED_RDMA_MAX_MR_SIZE; + + /* The maximum CQE capacity per CQ supported. + * max number of cqes will be in two layer pbl, + * 8 is the pointer size in bytes + * 32 is the size of cq element in bytes + */ + if (params->cq_mode == QED_RDMA_CQ_MODE_32_BITS) + dev->max_cqe = QED_RDMA_MAX_CQE_32_BIT; + else + dev->max_cqe = QED_RDMA_MAX_CQE_16_BIT; + + dev->max_mw = 0; + dev->max_fmr = QED_RDMA_MAX_FMR; + dev->max_mr_mw_fmr_pbl = (PAGE_SIZE / 8) * (PAGE_SIZE / 8); + dev->max_mr_mw_fmr_size = dev->max_mr_mw_fmr_pbl * PAGE_SIZE; + dev->max_pkey = QED_RDMA_MAX_P_KEY; + + dev->max_qp_resp_rd_atomic_resc = RDMA_RING_PAGE_SIZE / + (RDMA_RESP_RD_ATOMIC_ELM_SIZE * 2); + dev->max_qp_req_rd_atomic_resc = RDMA_RING_PAGE_SIZE / + RDMA_REQ_RD_ATOMIC_ELM_SIZE; + dev->max_dev_resp_rd_atomic_resc = dev->max_qp_resp_rd_atomic_resc * + p_hwfn->p_rdma_info->num_qps; + dev->page_size_caps = QED_RDMA_PAGE_SIZE_CAPS; + dev->dev_ack_delay = QED_RDMA_ACK_DELAY; + dev->max_pd = RDMA_MAX_PDS; + dev->max_ah = p_hwfn->p_rdma_info->num_qps; + dev->max_stats_queues = (u8)RESC_NUM(p_hwfn, QED_RDMA_STATS_QUEUE); + + /* Set capablities */ + dev->dev_caps = 0; + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_RNR_NAK, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_RESIZE_CQ, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_BASE_MEMORY_EXT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_BASE_QUEUE_EXT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_ZBVA, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_LOCAL_INV_FENCE, 1); + + /* Check atomic operations support in PCI configuration space. */ + pci_read_config_dword(cdev->pdev, + cdev->pdev->pcie_cap + PCI_EXP_DEVCTL2, + &pci_status_control); + + if (pci_status_control & PCI_EXP_DEVCTL2_LTR_EN) + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_ATOMIC_OP, 1); +} + +static void qed_rdma_init_port(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_port *port = p_hwfn->p_rdma_info->port; + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + + port->port_state = p_hwfn->mcp_info->link_output.link_up ? + QED_RDMA_PORT_UP : QED_RDMA_PORT_DOWN; + + port->max_msg_size = min_t(u64, + (dev->max_mr_mw_fmr_size * + p_hwfn->cdev->rdma_max_sge), + BIT(31)); + + port->pkey_bad_counter = 0; +} + +static int qed_rdma_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 ll2_ethertype_en; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Initializing HW\n"); + p_hwfn->b_rdma_enabled_in_prs = false; + + qed_wr(p_hwfn, p_ptt, PRS_REG_ROCE_DEST_QP_MAX_PF, 0); + + p_hwfn->rdma_prs_search_reg = PRS_REG_SEARCH_ROCE; + + /* We delay writing to this reg until first cid is allocated. See + * qed_cxt_dynamic_ilt_alloc function for more details + */ + ll2_ethertype_en = qed_rd(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN); + qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN, + (ll2_ethertype_en | 0x01)); + + if (qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_ROCE) % 2) { + DP_NOTICE(p_hwfn, "The first RoCE's cid should be even\n"); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Initializing HW - Done\n"); + return 0; +} + +static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params, + struct qed_ptt *p_ptt) +{ + struct rdma_init_func_ramrod_data *p_ramrod; + struct qed_rdma_cnq_params *p_cnq_pbl_list; + struct rdma_init_func_hdr *p_params_header; + struct rdma_cnq_params *p_cnq_params; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u32 cnq_id, sb_id; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Starting FW\n"); + + /* Save the number of cnqs for the function close ramrod */ + p_hwfn->p_rdma_info->num_cnqs = params->desired_cnq; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_FUNC_INIT, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.roce_init_func.rdma; + + p_params_header = &p_ramrod->params_header; + p_params_header->cnq_start_offset = (u8)RESC_START(p_hwfn, + QED_RDMA_CNQ_RAM); + p_params_header->num_cnqs = params->desired_cnq; + + if (params->cq_mode == QED_RDMA_CQ_MODE_16_BITS) + p_params_header->cq_ring_mode = 1; + else + p_params_header->cq_ring_mode = 0; + + for (cnq_id = 0; cnq_id < params->desired_cnq; cnq_id++) { + sb_id = qed_rdma_get_sb_id(p_hwfn, cnq_id); + p_cnq_params = &p_ramrod->cnq_params[cnq_id]; + p_cnq_pbl_list = ¶ms->cnq_pbl_list[cnq_id]; + p_cnq_params->sb_num = + cpu_to_le16(p_hwfn->sbs_info[sb_id]->igu_sb_id); + + p_cnq_params->sb_index = p_hwfn->pf_params.rdma_pf_params.gl_pi; + p_cnq_params->num_pbl_pages = p_cnq_pbl_list->num_pbl_pages; + + DMA_REGPAIR_LE(p_cnq_params->pbl_base_addr, + p_cnq_pbl_list->pbl_ptr); + + /* we assume here that cnq_id and qz_offset are the same */ + p_cnq_params->queue_zone_num = + cpu_to_le16(p_hwfn->p_rdma_info->queue_zone_base + + cnq_id); + } + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + + /* The first DPI is reserved for the Kernel */ + __set_bit(0, p_hwfn->p_rdma_info->dpi_map.bitmap); + + /* Tid 0 will be used as the key for "reserved MR". + * The driver should allocate memory for it so it can be loaded but no + * ramrod should be passed on it. + */ + qed_rdma_alloc_tid(p_hwfn, &dev->reserved_lkey); + if (dev->reserved_lkey != RDMA_RESERVED_LKEY) { + DP_NOTICE(p_hwfn, + "Reserved lkey should be equal to RDMA_RESERVED_LKEY\n"); + return -EINVAL; + } + + return 0; +} + +static int qed_rdma_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params) +{ + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA setup\n"); + + spin_lock_init(&p_hwfn->p_rdma_info->lock); + + qed_rdma_init_devinfo(p_hwfn, params); + qed_rdma_init_port(p_hwfn); + qed_rdma_init_events(p_hwfn, params); + + rc = qed_rdma_reserve_lkey(p_hwfn); + if (rc) + return rc; + + rc = qed_rdma_init_hw(p_hwfn, p_ptt); + if (rc) + return rc; + + return qed_rdma_start_fw(p_hwfn, params, p_ptt); +} + +int qed_rdma_stop(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_close_func_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + struct qed_ptt *p_ptt; + u32 ll2_ethertype_en; + int rc = -EBUSY; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA stop\n"); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to acquire PTT\n"); + return rc; + } + + /* Disable RoCE search */ + qed_wr(p_hwfn, p_ptt, p_hwfn->rdma_prs_search_reg, 0); + p_hwfn->b_rdma_enabled_in_prs = false; + + qed_wr(p_hwfn, p_ptt, PRS_REG_ROCE_DEST_QP_MAX_PF, 0); + + ll2_ethertype_en = qed_rd(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN); + + qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN, + (ll2_ethertype_en & 0xFFFE)); + + qed_ptt_release(p_hwfn, p_ptt); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + /* Stop RoCE */ + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_FUNC_CLOSE, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + goto out; + + p_ramrod = &p_ent->ramrod.rdma_close_func; + + p_ramrod->num_cnqs = p_hwfn->p_rdma_info->num_cnqs; + p_ramrod->cnq_start_offset = (u8)RESC_START(p_hwfn, QED_RDMA_CNQ_RAM); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + +out: + qed_rdma_free(p_hwfn); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA stop done, rc = %d\n", rc); + return rc; +} + +int qed_rdma_add_user(void *rdma_cxt, + struct qed_rdma_add_user_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + u32 dpi_start_offset; + u32 returned_id = 0; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Adding User\n"); + + /* Allocate DPI */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_hwfn->p_rdma_info->dpi_map, + &returned_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + out_params->dpi = (u16)returned_id; + + /* Calculate the corresponding DPI address */ + dpi_start_offset = p_hwfn->dpi_start_offset; + + out_params->dpi_addr = (u64)((u8 __iomem *)p_hwfn->doorbells + + dpi_start_offset + + ((out_params->dpi) * p_hwfn->dpi_size)); + + out_params->dpi_phys_addr = p_hwfn->cdev->db_phys_addr + + dpi_start_offset + + ((out_params->dpi) * p_hwfn->dpi_size); + + out_params->dpi_size = p_hwfn->dpi_size; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Adding user - done, rc = %d\n", rc); + return rc; +} + +struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Query device\n"); + + /* Return struct with device parameters */ + return p_hwfn->p_rdma_info->dev; +} + +int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) + goto out; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); + return rc; +} + +void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) +{ + struct qed_hwfn *p_hwfn; + u16 qz_num; + u32 addr; + + p_hwfn = (struct qed_hwfn *)rdma_cxt; + qz_num = p_hwfn->p_rdma_info->queue_zone_base + qz_offset; + addr = GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_COMMON_QUEUE_CONS_OFFSET(qz_num); + + REG_WR16(p_hwfn, addr, prod); + + /* keep prod updates ordered */ + wmb(); +} + +static int qed_fill_rdma_dev_info(struct qed_dev *cdev, + struct qed_dev_rdma_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->rdma_type = QED_RDMA_TYPE_ROCE; + + qed_fill_dev_info(cdev, &info->common); + + return 0; +} + +static int qed_rdma_get_sb_start(struct qed_dev *cdev) +{ + int feat_num; + + if (cdev->num_hwfns > 1) + feat_num = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_PF_L2_QUE); + else + feat_num = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_PF_L2_QUE) * + cdev->num_hwfns; + + return feat_num; +} + +static int qed_rdma_get_min_cnq_msix(struct qed_dev *cdev) +{ + int n_cnq = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_RDMA_CNQ); + int n_msix = cdev->int_params.rdma_msix_cnt; + + return min_t(int, n_cnq, n_msix); +} + +static int qed_rdma_set_int(struct qed_dev *cdev, u16 cnt) +{ + int limit = 0; + + /* Mark the fastpath as free/used */ + cdev->int_params.fp_initialized = cnt ? true : false; + + if (cdev->int_params.out.int_mode != QED_INT_MODE_MSIX) { + DP_ERR(cdev, + "qed roce supports only MSI-X interrupts (detected %d).\n", + cdev->int_params.out.int_mode); + return -EINVAL; + } else if (cdev->int_params.fp_msix_cnt) { + limit = cdev->int_params.rdma_msix_cnt; + } + + if (!limit) + return -ENOMEM; + + return min_t(int, cnt, limit); +} + +static int qed_rdma_get_int(struct qed_dev *cdev, struct qed_int_info *info) +{ + memset(info, 0, sizeof(*info)); + + if (!cdev->int_params.fp_initialized) { + DP_INFO(cdev, + "Protocol driver requested interrupt information, but its support is not yet configured\n"); + return -EINVAL; + } + + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + int msix_base = cdev->int_params.rdma_msix_base; + + info->msix_cnt = cdev->int_params.rdma_msix_cnt; + info->msix = &cdev->int_params.msix_table[msix_base]; + + DP_VERBOSE(cdev, QED_MSG_RDMA, "msix_cnt = %d msix_base=%d\n", + info->msix_cnt, msix_base); + } + + return 0; +} + +static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) +{ + return QED_LEADING_HWFN(cdev); +} + +static void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 val; + + val = (p_hwfn->dcbx_no_edpm || p_hwfn->db_bar_no_edpm) ? 0 : 1; + + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DPM_ENABLE, val); + DP_VERBOSE(p_hwfn, (QED_MSG_DCB | QED_MSG_RDMA), + "Changing DPM_EN state to %d (DCBX=%d, DB_BAR=%d)\n", + val, p_hwfn->dcbx_no_edpm, p_hwfn->db_bar_no_edpm); +} + +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + p_hwfn->db_bar_no_edpm = true; + + qed_rdma_dpm_conf(p_hwfn, p_ptt); +} + +int qed_rdma_start(void *rdma_cxt, struct qed_rdma_start_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_ptt *p_ptt; + int rc = -EBUSY; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "desired_cnq = %08x\n", params->desired_cnq); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + goto err; + + rc = qed_rdma_alloc(p_hwfn, p_ptt, params); + if (rc) + goto err1; + + rc = qed_rdma_setup(p_hwfn, p_ptt, params); + if (rc) + goto err2; + + qed_ptt_release(p_hwfn, p_ptt); + + return rc; + +err2: + qed_rdma_free(p_hwfn); +err1: + qed_ptt_release(p_hwfn, p_ptt); +err: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA start - error, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_init(struct qed_dev *cdev, + struct qed_rdma_start_in_params *params) +{ + return qed_rdma_start(QED_LEADING_HWFN(cdev), params); +} + +void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "dpi = %08x\n", dpi); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->dpi_map, dpi); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static const struct qed_rdma_ops qed_rdma_ops_pass = { + .common = &qed_common_ops_pass, + .fill_dev_info = &qed_fill_rdma_dev_info, + .rdma_get_rdma_ctx = &qed_rdma_get_rdma_ctx, + .rdma_init = &qed_rdma_init, + .rdma_add_user = &qed_rdma_add_user, + .rdma_remove_user = &qed_rdma_remove_user, + .rdma_stop = &qed_rdma_stop, + .rdma_query_device = &qed_rdma_query_device, + .rdma_get_start_sb = &qed_rdma_get_sb_start, + .rdma_get_rdma_int = &qed_rdma_get_int, + .rdma_set_rdma_int = &qed_rdma_set_int, + .rdma_get_min_cnq_msix = &qed_rdma_get_min_cnq_msix, + .rdma_cnq_prod_update = &qed_rdma_cnq_prod_update, +}; + +const struct qed_rdma_ops *qed_get_rdma_ops() +{ + return &qed_rdma_ops_pass; +} +EXPORT_SYMBOL(qed_get_rdma_ops); diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h new file mode 100644 index 000000000000..e55048106a83 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -0,0 +1,123 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_ROCE_H +#define _QED_ROCE_H +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" + +#define QED_RDMA_MAX_FMR (RDMA_MAX_TIDS) +#define QED_RDMA_MAX_P_KEY (1) +#define QED_RDMA_MAX_WQE (0x7FFF) +#define QED_RDMA_MAX_SRQ_WQE_ELEM (0x7FFF) +#define QED_RDMA_PAGE_SIZE_CAPS (0xFFFFF000) +#define QED_RDMA_ACK_DELAY (15) +#define QED_RDMA_MAX_MR_SIZE (0x10000000000ULL) +#define QED_RDMA_MAX_CQS (RDMA_MAX_CQS) +#define QED_RDMA_MAX_MRS (RDMA_MAX_TIDS) +/* Add 1 for header element */ +#define QED_RDMA_MAX_SRQ_ELEM_PER_WQE (RDMA_MAX_SGE_PER_RQ_WQE + 1) +#define QED_RDMA_MAX_SGE_PER_SRQ_WQE (RDMA_MAX_SGE_PER_RQ_WQE) +#define QED_RDMA_SRQ_WQE_ELEM_SIZE (16) +#define QED_RDMA_MAX_SRQS (32 * 1024) + +#define QED_RDMA_MAX_CQE_32_BIT (0x7FFFFFFF - 1) +#define QED_RDMA_MAX_CQE_16_BIT (0x7FFF - 1) + +enum qed_rdma_toggle_bit { + QED_RDMA_TOGGLE_BIT_CLEAR = 0, + QED_RDMA_TOGGLE_BIT_SET = 1 +}; + +struct qed_bmap { + unsigned long *bitmap; + u32 max_count; +}; + +struct qed_rdma_info { + /* spin lock to protect bitmaps */ + spinlock_t lock; + + struct qed_bmap cq_map; + struct qed_bmap pd_map; + struct qed_bmap tid_map; + struct qed_bmap qp_map; + struct qed_bmap srq_map; + struct qed_bmap cid_map; + struct qed_bmap dpi_map; + struct qed_bmap toggle_bits; + struct qed_rdma_events events; + struct qed_rdma_device *dev; + struct qed_rdma_port *port; + u32 last_tid; + u8 num_cnqs; + u32 num_qps; + u32 num_mrs; + u16 queue_zone_base; + enum protocol_type proto; +}; + +int +qed_rdma_add_user(void *rdma_cxt, + struct qed_rdma_add_user_out_params *out_params); +int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd); +int qed_rdma_alloc_tid(void *rdma_cxt, u32 *tid); +int qed_rdma_deregister_tid(void *rdma_cxt, u32 tid); +void qed_rdma_free_tid(void *rdma_cxt, u32 tid); +struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt); +int +qed_rdma_register_tid(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *params); +void qed_rdma_remove_user(void *rdma_cxt, u16 dpi); +int qed_rdma_start(void *p_hwfn, struct qed_rdma_start_in_params *params); +int qed_rdma_stop(void *rdma_cxt); +u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id); +u32 qed_rdma_query_cau_timer_res(void *p_hwfn); +void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 cnq_index, u16 prod); +void qed_rdma_resc_free(struct qed_hwfn *p_hwfn); +void qed_async_roce_event(struct qed_hwfn *p_hwfn, + struct event_ring_entry *p_eqe); + +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +#else +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +#endif +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h index a3c539f1c2ac..652c90819758 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -85,6 +85,7 @@ union ramrod_data { struct rdma_srq_create_ramrod_data rdma_create_srq; struct rdma_srq_destroy_ramrod_data rdma_destroy_srq; struct rdma_srq_modify_ramrod_data rdma_modify_srq; + struct roce_init_func_ramrod_data roce_init_func; struct iscsi_slow_path_hdr iscsi_empty; struct iscsi_init_ramrod_params iscsi_init; diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 349af182d085..caff41544898 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -28,6 +28,9 @@ #include "qed_reg_addr.h" #include "qed_sp.h" #include "qed_sriov.h" +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +#include "qed_roce.h" +#endif /*************************************************************************** * Structures & Definitions @@ -237,6 +240,11 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) { switch (p_eqe->protocol_id) { +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) + case PROTOCOLID_ROCE: + qed_async_roce_event(p_hwfn, p_eqe); + return 0; +#endif case PROTOCOLID_COMMON: return qed_sriov_eqe_event(p_hwfn, p_eqe->opcode, diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index a4a3cead15bb..d2d6621fe0e5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -1851,8 +1851,8 @@ static void qed_iov_vf_mbx_start_txq_resp(struct qed_hwfn *p_hwfn, if ((status == PFVF_STATUS_SUCCESS) && !b_legacy) { u16 qid = mbx->req_virt->start_txq.tx_qid; - p_tlv->offset = qed_db_addr(p_vf->vf_queues[qid].fw_cid, - DQ_DEMS_LEGACY); + p_tlv->offset = qed_db_addr_vf(p_vf->vf_queues[qid].fw_cid, + DQ_DEMS_LEGACY); } qed_iov_send_response(p_hwfn, p_ptt, p_vf, length, status); diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 85334ceaf69c..abf5bf11f865 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -544,7 +544,7 @@ int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn, u8 cid = p_iov->acquire_resp.resc.cid[tx_queue_id]; u32 db_addr; - db_addr = qed_db_addr(cid, DQ_DEMS_LEGACY); + db_addr = qed_db_addr_vf(cid, DQ_DEMS_LEGACY); *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + db_addr; } diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 19027635df0d..734deb094618 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -674,6 +674,7 @@ union event_ring_data { struct iscsi_eqe_data iscsi_info; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; + struct regpair roce_handle; }; /* Event Ring Entry */ diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index e313742b571d..f9ae903bbb84 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -34,6 +34,8 @@ enum dcbx_protocol_type { DCBX_MAX_PROTOCOL_TYPE }; +#define QED_ROCE_PROTOCOL_INDEX (3) + #ifdef CONFIG_DCB #define QED_LLDP_CHASSIS_ID_STAT_LEN 4 #define QED_LLDP_PORT_ID_STAT_LEN 4 @@ -268,6 +270,7 @@ struct qed_dev_info { enum qed_sb_type { QED_SB_TYPE_L2_QUEUE, + QED_SB_TYPE_CNQ, }; enum qed_protocol { @@ -628,7 +631,7 @@ enum DP_MODULE { QED_MSG_CXT = 0x800000, QED_MSG_LL2 = 0x1000000, QED_MSG_ILT = 0x2000000, - QED_MSG_ROCE = 0x4000000, + QED_MSG_RDMA = 0x4000000, QED_MSG_DEBUG = 0x8000000, /* to be added...up to 0x8000000 */ }; diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h new file mode 100644 index 000000000000..0f7d5275e515 --- /dev/null +++ b/include/linux/qed/qed_roce_if.h @@ -0,0 +1,345 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_ROCE_IF_H +#define _QED_ROCE_IF_H +#include +#include +#include +#include +#include +#include +#include +#include + +#define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) + +/* rdma interface */ +enum qed_rdma_tid_type { + QED_RDMA_TID_REGISTERED_MR, + QED_RDMA_TID_FMR, + QED_RDMA_TID_MW_TYPE1, + QED_RDMA_TID_MW_TYPE2A +}; + +struct qed_rdma_events { + void *context; + void (*affiliated_event)(void *context, u8 fw_event_code, + void *fw_handle); + void (*unaffiliated_event)(void *context, u8 event_code); +}; + +struct qed_rdma_device { + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 fw_ver; + + u64 node_guid; + u64 sys_image_guid; + + u8 max_cnq; + u8 max_sge; + u8 max_srq_sge; + u16 max_inline; + u32 max_wqe; + u32 max_srq_wqe; + u8 max_qp_resp_rd_atomic_resc; + u8 max_qp_req_rd_atomic_resc; + u64 max_dev_resp_rd_atomic_resc; + u32 max_cq; + u32 max_qp; + u32 max_srq; + u32 max_mr; + u64 max_mr_size; + u32 max_cqe; + u32 max_mw; + u32 max_fmr; + u32 max_mr_mw_fmr_pbl; + u64 max_mr_mw_fmr_size; + u32 max_pd; + u32 max_ah; + u8 max_pkey; + u16 max_srq_wr; + u8 max_stats_queues; + u32 dev_caps; + + /* Abilty to support RNR-NAK generation */ + +#define QED_RDMA_DEV_CAP_RNR_NAK_MASK 0x1 +#define QED_RDMA_DEV_CAP_RNR_NAK_SHIFT 0 + /* Abilty to support shutdown port */ +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_MASK 0x1 +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_SHIFT 1 + /* Abilty to support port active event */ +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_SHIFT 2 + /* Abilty to support port change event */ +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_SHIFT 3 + /* Abilty to support system image GUID */ +#define QED_RDMA_DEV_CAP_SYS_IMAGE_MASK 0x1 +#define QED_RDMA_DEV_CAP_SYS_IMAGE_SHIFT 4 + /* Abilty to support bad P_Key counter support */ +#define QED_RDMA_DEV_CAP_BAD_PKEY_CNT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BAD_PKEY_CNT_SHIFT 5 + /* Abilty to support atomic operations */ +#define QED_RDMA_DEV_CAP_ATOMIC_OP_MASK 0x1 +#define QED_RDMA_DEV_CAP_ATOMIC_OP_SHIFT 6 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_SHIFT 7 + /* Abilty to support modifying the maximum number of + * outstanding work requests per QP + */ +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_SHIFT 8 + /* Abilty to support automatic path migration */ +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_MASK 0x1 +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_SHIFT 9 + /* Abilty to support the base memory management extensions */ +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_SHIFT 10 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_SHIFT 11 + /* Abilty to support multipile page sizes per memory region */ +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_SHIFT 12 + /* Abilty to support block list physical buffer list */ +#define QED_RDMA_DEV_CAP_BLOCK_MODE_MASK 0x1 +#define QED_RDMA_DEV_CAP_BLOCK_MODE_SHIFT 13 + /* Abilty to support zero based virtual addresses */ +#define QED_RDMA_DEV_CAP_ZBVA_MASK 0x1 +#define QED_RDMA_DEV_CAP_ZBVA_SHIFT 14 + /* Abilty to support local invalidate fencing */ +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_MASK 0x1 +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_SHIFT 15 + /* Abilty to support Loopback on QP */ +#define QED_RDMA_DEV_CAP_LB_INDICATOR_MASK 0x1 +#define QED_RDMA_DEV_CAP_LB_INDICATOR_SHIFT 16 + u64 page_size_caps; + u8 dev_ack_delay; + u32 reserved_lkey; + u32 bad_pkey_counter; + struct qed_rdma_events events; +}; + +enum qed_port_state { + QED_RDMA_PORT_UP, + QED_RDMA_PORT_DOWN, +}; + +enum qed_roce_capability { + QED_ROCE_V1 = 1 << 0, + QED_ROCE_V2 = 1 << 1, +}; + +struct qed_rdma_port { + enum qed_port_state port_state; + int link_speed; + u64 max_msg_size; + u8 source_gid_table_len; + void *source_gid_table_ptr; + u8 pkey_table_len; + void *pkey_table_ptr; + u32 pkey_bad_counter; + enum qed_roce_capability capability; +}; + +struct qed_rdma_cnq_params { + u8 num_pbl_pages; + u64 pbl_ptr; +}; + +/* The CQ Mode affects the CQ doorbell transaction size. + * 64/32 bit machines should configure to 32/16 bits respectively. + */ +enum qed_rdma_cq_mode { + QED_RDMA_CQ_MODE_16_BITS, + QED_RDMA_CQ_MODE_32_BITS, +}; + +struct qed_roce_dcqcn_params { + u8 notification_point; + u8 reaction_point; + + /* fields for notification point */ + u32 cnp_send_timeout; + + /* fields for reaction point */ + u32 rl_bc_rate; + u16 rl_max_rate; + u16 rl_r_ai; + u16 rl_r_hai; + u16 dcqcn_g; + u32 dcqcn_k_us; + u32 dcqcn_timeout_us; +}; + +struct qed_rdma_start_in_params { + struct qed_rdma_events *events; + struct qed_rdma_cnq_params cnq_pbl_list[128]; + u8 desired_cnq; + enum qed_rdma_cq_mode cq_mode; + struct qed_roce_dcqcn_params dcqcn_params; + u16 max_mtu; + u8 mac_addr[ETH_ALEN]; + u8 iwarp_flags; +}; + +struct qed_rdma_add_user_out_params { + u16 dpi; + u64 dpi_addr; + u64 dpi_phys_addr; + u32 dpi_size; +}; + +enum roce_mode { + ROCE_V1, + ROCE_V2_IPV4, + ROCE_V2_IPV6, + MAX_ROCE_MODE +}; + +union qed_gid { + u8 bytes[16]; + u16 words[8]; + u32 dwords[4]; + u64 qwords[2]; + u32 ipv4_addr; +}; + +struct qed_rdma_register_tid_in_params { + u32 itid; + enum qed_rdma_tid_type tid_type; + u8 key; + u16 pd; + bool local_read; + bool local_write; + bool remote_read; + bool remote_write; + bool remote_atomic; + bool mw_bind; + u64 pbl_ptr; + bool pbl_two_level; + u8 pbl_page_size_log; + u8 page_size_log; + u32 fbo; + u64 length; + u64 vaddr; + bool zbva; + bool phy_mr; + bool dma_mr; + + bool dif_enabled; + u64 dif_error_addr; + u64 dif_runt_addr; +}; + +struct qed_rdma_create_srq_in_params { + u64 pbl_base_addr; + u64 prod_pair_addr; + u16 num_pages; + u16 pd_id; + u16 page_size; +}; + +struct qed_rdma_create_srq_out_params { + u16 srq_id; +}; + +struct qed_rdma_destroy_srq_in_params { + u16 srq_id; +}; + +struct qed_rdma_modify_srq_in_params { + u32 wqe_limit; + u16 srq_id; +}; + +struct qed_rdma_stats_out_params { + u64 sent_bytes; + u64 sent_pkts; + u64 rcv_bytes; + u64 rcv_pkts; +}; + +struct qed_rdma_counters_out_params { + u64 pd_count; + u64 max_pd; + u64 dpi_count; + u64 max_dpi; + u64 cq_count; + u64 max_cq; + u64 qp_count; + u64 max_qp; + u64 tid_count; + u64 max_tid; +}; + +#define QED_ROCE_TX_HEAD_FAILURE (1) +#define QED_ROCE_TX_FRAG_FAILURE (2) + +enum qed_rdma_type { + QED_RDMA_TYPE_ROCE, +}; + +struct qed_dev_rdma_info { + struct qed_dev_info common; + enum qed_rdma_type rdma_type; +}; + +struct qed_rdma_ops { + const struct qed_common_ops *common; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_rdma_info *info); + void *(*rdma_get_rdma_ctx)(struct qed_dev *cdev); + + int (*rdma_init)(struct qed_dev *dev, + struct qed_rdma_start_in_params *iparams); + + int (*rdma_add_user)(void *rdma_cxt, + struct qed_rdma_add_user_out_params *oparams); + + void (*rdma_remove_user)(void *rdma_cxt, u16 dpi); + int (*rdma_stop)(void *rdma_cxt); + struct qed_rdma_device* (*rdma_query_device)(void *rdma_cxt); + int (*rdma_get_start_sb)(struct qed_dev *cdev); + int (*rdma_get_min_cnq_msix)(struct qed_dev *cdev); + void (*rdma_cnq_prod_update)(void *rdma_cxt, u8 cnq_index, u16 prod); + int (*rdma_get_rdma_int)(struct qed_dev *cdev, + struct qed_int_info *info); + int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); +}; + +const struct qed_rdma_ops *qed_get_rdma_ops(void); + +#endif diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h index 187991c1f439..7663725faa94 100644 --- a/include/linux/qed/rdma_common.h +++ b/include/linux/qed/rdma_common.h @@ -28,6 +28,7 @@ #define RDMA_MAX_PDS (64 * 1024) #define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB #define RDMA_TASK_TYPE (PROTOCOLID_ROCE) -- cgit v1.2.3 From c295f86e60f5ba67f0f4bba2bb2c22b3cbf01ec1 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:58 +0300 Subject: qed: PD,PKEY and CQ verb support Add support for the configurations of the protection domain and completion queues. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_roce.c | 324 +++++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_roce.h | 21 ++ include/linux/qed/qed_roce_if.h | 30 +++ 3 files changed, 375 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 4c53b857cc1c..f9551643428f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -663,6 +663,22 @@ int qed_rdma_add_user(void *rdma_cxt, return rc; } +struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_port *p_port = p_hwfn->p_rdma_info->port; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA Query port\n"); + + /* Link may have changed */ + p_port->port_state = p_hwfn->mcp_info->link_output.link_up ? + QED_RDMA_PORT_UP : QED_RDMA_PORT_DOWN; + + p_port->link_speed = p_hwfn->mcp_info->link_output.speed; + + return p_port; +} + struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -788,6 +804,309 @@ static int qed_rdma_get_int(struct qed_dev *cdev, struct qed_int_info *info) return 0; } +int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + u32 returned_id; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc PD\n"); + + /* Allocates an unused protection domain */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->pd_map, &returned_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + *pd = (u16)returned_id; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc PD - done, rc = %d\n", rc); + return rc; +} + +void qed_rdma_free_pd(void *rdma_cxt, u16 pd) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "pd = %08x\n", pd); + + /* Returns a previously allocated protection domain for reuse */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->pd_map, pd); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static enum qed_rdma_toggle_bit +qed_rdma_toggle_bit_create_resize_cq(struct qed_hwfn *p_hwfn, u16 icid) +{ + struct qed_rdma_info *p_info = p_hwfn->p_rdma_info; + enum qed_rdma_toggle_bit toggle_bit; + u32 bmap_id; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", icid); + + /* the function toggle the bit that is related to a given icid + * and returns the new toggle bit's value + */ + bmap_id = icid - qed_cxt_get_proto_cid_start(p_hwfn, p_info->proto); + + spin_lock_bh(&p_info->lock); + toggle_bit = !test_and_change_bit(bmap_id, + p_info->toggle_bits.bitmap); + spin_unlock_bh(&p_info->lock); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QED_RDMA_TOGGLE_BIT_= %d\n", + toggle_bit); + + return toggle_bit; +} + +int qed_rdma_create_cq(void *rdma_cxt, + struct qed_rdma_create_cq_in_params *params, u16 *icid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_info *p_info = p_hwfn->p_rdma_info; + struct rdma_create_cq_ramrod_data *p_ramrod; + enum qed_rdma_toggle_bit toggle_bit; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u32 returned_id, start_cid; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "cq_handle = %08x%08x\n", + params->cq_handle_hi, params->cq_handle_lo); + + /* Allocate icid */ + spin_lock_bh(&p_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_info->cq_map, &returned_id); + spin_unlock_bh(&p_info->lock); + + if (rc) { + DP_NOTICE(p_hwfn, "Can't create CQ, rc = %d\n", rc); + return rc; + } + + start_cid = qed_cxt_get_proto_cid_start(p_hwfn, + p_info->proto); + *icid = returned_id + start_cid; + + /* Check if icid requires a page allocation */ + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, *icid); + if (rc) + goto err; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = *icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + /* Send create CQ ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_CREATE_CQ, + p_info->proto, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.rdma_create_cq; + + p_ramrod->cq_handle.hi = cpu_to_le32(params->cq_handle_hi); + p_ramrod->cq_handle.lo = cpu_to_le32(params->cq_handle_lo); + p_ramrod->dpi = cpu_to_le16(params->dpi); + p_ramrod->is_two_level_pbl = params->pbl_two_level; + p_ramrod->max_cqes = cpu_to_le32(params->cq_size); + DMA_REGPAIR_LE(p_ramrod->pbl_addr, params->pbl_ptr); + p_ramrod->pbl_num_pages = cpu_to_le16(params->pbl_num_pages); + p_ramrod->cnq_id = (u8)RESC_START(p_hwfn, QED_RDMA_CNQ_RAM) + + params->cnq_id; + p_ramrod->int_timeout = params->int_timeout; + + /* toggle the bit for every resize or create cq for a given icid */ + toggle_bit = qed_rdma_toggle_bit_create_resize_cq(p_hwfn, *icid); + + p_ramrod->toggle_bit = toggle_bit; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) { + /* restore toggle bit */ + qed_rdma_toggle_bit_create_resize_cq(p_hwfn, *icid); + goto err; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Created CQ, rc = %d\n", rc); + return rc; + +err: + /* release allocated icid */ + qed_bmap_release_id(p_hwfn, &p_info->cq_map, returned_id); + DP_NOTICE(p_hwfn, "Create CQ failed, rc = %d\n", rc); + + return rc; +} + +int qed_rdma_resize_cq(void *rdma_cxt, + struct qed_rdma_resize_cq_in_params *in_params, + struct qed_rdma_resize_cq_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_resize_cq_output_params *p_ramrod_res; + struct rdma_resize_cq_ramrod_data *p_ramrod; + enum qed_rdma_toggle_bit toggle_bit; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + u8 fw_return_code; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", in_params->icid); + + p_ramrod_res = + (struct rdma_resize_cq_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_resize_cq_output_params), + &ramrod_res_phys, GFP_KERNEL); + if (!p_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed resize cq failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = in_params->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_RESIZE_CQ, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.rdma_resize_cq; + + p_ramrod->flags = 0; + + /* toggle the bit for every resize or create cq for a given icid */ + toggle_bit = qed_rdma_toggle_bit_create_resize_cq(p_hwfn, + in_params->icid); + + SET_FIELD(p_ramrod->flags, + RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT, toggle_bit); + + SET_FIELD(p_ramrod->flags, + RDMA_RESIZE_CQ_RAMROD_DATA_IS_TWO_LEVEL_PBL, + in_params->pbl_two_level); + + p_ramrod->pbl_log_page_size = in_params->pbl_page_size_log - 12; + p_ramrod->pbl_num_pages = cpu_to_le16(in_params->pbl_num_pages); + p_ramrod->max_cqes = cpu_to_le32(in_params->cq_size); + DMA_REGPAIR_LE(p_ramrod->pbl_addr, in_params->pbl_ptr); + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) + goto err; + + if (fw_return_code != RDMA_RETURN_OK) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code); + rc = -EINVAL; + goto err; + } + + out_params->prod = le32_to_cpu(p_ramrod_res->old_cq_prod); + out_params->cons = le32_to_cpu(p_ramrod_res->old_cq_cons); + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_resize_cq_output_params), + p_ramrod_res, ramrod_res_phys); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Resized CQ, rc = %d\n", rc); + + return rc; + +err: dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_resize_cq_output_params), + p_ramrod_res, ramrod_res_phys); + DP_NOTICE(p_hwfn, "Resized CQ, Failed - rc = %d\n", rc); + + return rc; +} + +int qed_rdma_destroy_cq(void *rdma_cxt, + struct qed_rdma_destroy_cq_in_params *in_params, + struct qed_rdma_destroy_cq_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_destroy_cq_output_params *p_ramrod_res; + struct rdma_destroy_cq_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", in_params->icid); + + p_ramrod_res = + (struct rdma_destroy_cq_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + &ramrod_res_phys, GFP_KERNEL); + if (!p_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed destroy cq failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = in_params->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + /* Send destroy CQ ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_DESTROY_CQ, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.rdma_destroy_cq; + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + out_params->num_cq_notif = le16_to_cpu(p_ramrod_res->cnq_num); + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + p_ramrod_res, ramrod_res_phys); + + /* Free icid */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + + qed_bmap_release_id(p_hwfn, + &p_hwfn->p_rdma_info->cq_map, + (in_params->icid - + qed_cxt_get_proto_cid_start(p_hwfn, + p_hwfn-> + p_rdma_info->proto))); + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Destroyed CQ, rc = %d\n", rc); + return rc; + +err: dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) { return QED_LEADING_HWFN(cdev); @@ -871,12 +1190,17 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = { .rdma_add_user = &qed_rdma_add_user, .rdma_remove_user = &qed_rdma_remove_user, .rdma_stop = &qed_rdma_stop, + .rdma_query_port = &qed_rdma_query_port, .rdma_query_device = &qed_rdma_query_device, .rdma_get_start_sb = &qed_rdma_get_sb_start, .rdma_get_rdma_int = &qed_rdma_get_int, .rdma_set_rdma_int = &qed_rdma_set_int, .rdma_get_min_cnq_msix = &qed_rdma_get_min_cnq_msix, .rdma_cnq_prod_update = &qed_rdma_cnq_prod_update, + .rdma_alloc_pd = &qed_rdma_alloc_pd, + .rdma_dealloc_pd = &qed_rdma_free_pd, + .rdma_create_cq = &qed_rdma_create_cq, + .rdma_destroy_cq = &qed_rdma_destroy_cq, }; const struct qed_rdma_ops *qed_get_rdma_ops() diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index e55048106a83..1fe73707e0b5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -94,6 +94,26 @@ struct qed_rdma_info { enum protocol_type proto; }; +struct qed_rdma_resize_cq_in_params { + u16 icid; + u32 cq_size; + bool pbl_two_level; + u64 pbl_ptr; + u16 pbl_num_pages; + u8 pbl_page_size_log; +}; + +struct qed_rdma_resize_cq_out_params { + u32 prod; + u32 cons; +}; + +struct qed_rdma_resize_cnq_in_params { + u32 cnq_id; + u32 pbl_page_size_log; + u64 pbl_ptr; +}; + int qed_rdma_add_user(void *rdma_cxt, struct qed_rdma_add_user_out_params *out_params); @@ -102,6 +122,7 @@ int qed_rdma_alloc_tid(void *rdma_cxt, u32 *tid); int qed_rdma_deregister_tid(void *rdma_cxt, u32 tid); void qed_rdma_free_tid(void *rdma_cxt, u32 tid); struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt); +struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt); int qed_rdma_register_tid(void *rdma_cxt, struct qed_rdma_register_tid_in_params *params); diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 0f7d5275e515..b559b1c9e76d 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -263,6 +263,19 @@ struct qed_rdma_register_tid_in_params { u64 dif_runt_addr; }; +struct qed_rdma_create_cq_in_params { + u32 cq_handle_lo; + u32 cq_handle_hi; + u32 cq_size; + u16 dpi; + bool pbl_two_level; + u64 pbl_ptr; + u16 pbl_num_pages; + u8 pbl_page_size_log; + u8 cnq_id; + u16 int_timeout; +}; + struct qed_rdma_create_srq_in_params { u64 pbl_base_addr; u64 prod_pair_addr; @@ -271,6 +284,14 @@ struct qed_rdma_create_srq_in_params { u16 page_size; }; +struct qed_rdma_destroy_cq_in_params { + u16 icid; +}; + +struct qed_rdma_destroy_cq_out_params { + u16 num_cq_notif; +}; + struct qed_rdma_create_srq_out_params { u16 srq_id; }; @@ -332,12 +353,21 @@ struct qed_rdma_ops { void (*rdma_remove_user)(void *rdma_cxt, u16 dpi); int (*rdma_stop)(void *rdma_cxt); struct qed_rdma_device* (*rdma_query_device)(void *rdma_cxt); + struct qed_rdma_port* (*rdma_query_port)(void *rdma_cxt); int (*rdma_get_start_sb)(struct qed_dev *cdev); int (*rdma_get_min_cnq_msix)(struct qed_dev *cdev); void (*rdma_cnq_prod_update)(void *rdma_cxt, u8 cnq_index, u16 prod); int (*rdma_get_rdma_int)(struct qed_dev *cdev, struct qed_int_info *info); int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); + int (*rdma_alloc_pd)(void *rdma_cxt, u16 *pd); + void (*rdma_dealloc_pd)(void *rdma_cxt, u16 pd); + int (*rdma_create_cq)(void *rdma_cxt, + struct qed_rdma_create_cq_in_params *params, + u16 *icid); + int (*rdma_destroy_cq)(void *rdma_cxt, + struct qed_rdma_destroy_cq_in_params *iparams, + struct qed_rdma_destroy_cq_out_params *oparams); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From f109394033521862f2558df93d9afc4dfa829c6a Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:59 +0300 Subject: qed: Add support for QP verbs Add support for the slowpath configurations of Queue Pair verbs which adds, deletes, modifies and queries Queue Pairs. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_cxt.h | 1 + drivers/net/ethernet/qlogic/qed/qed_roce.c | 1197 ++++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_roce.h | 71 ++ include/linux/qed/qed_roce_if.h | 144 ++++ 4 files changed, 1413 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h index d00ad055802b..2b8bdaa77800 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h @@ -176,6 +176,7 @@ u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn, enum protocol_type type); u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn, enum protocol_type type); +int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto); #define QED_CTX_WORKING_MEM 0 #define QED_CTX_FL_MEM 1 diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index f9551643428f..438a0badc4e1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -1107,6 +1107,1199 @@ err: dma_free_coherent(&p_hwfn->cdev->pdev->dev, return rc; } +static void qed_rdma_set_fw_mac(u16 *p_fw_mac, u8 *p_qed_mac) +{ + p_fw_mac[0] = cpu_to_le16((p_qed_mac[0] << 8) + p_qed_mac[1]); + p_fw_mac[1] = cpu_to_le16((p_qed_mac[2] << 8) + p_qed_mac[3]); + p_fw_mac[2] = cpu_to_le16((p_qed_mac[4] << 8) + p_qed_mac[5]); +} + +static void qed_rdma_copy_gids(struct qed_rdma_qp *qp, __le32 *src_gid, + __le32 *dst_gid) +{ + u32 i; + + if (qp->roce_mode == ROCE_V2_IPV4) { + /* The IPv4 addresses shall be aligned to the highest word. + * The lower words must be zero. + */ + memset(src_gid, 0, sizeof(union qed_gid)); + memset(dst_gid, 0, sizeof(union qed_gid)); + src_gid[3] = cpu_to_le32(qp->sgid.ipv4_addr); + dst_gid[3] = cpu_to_le32(qp->dgid.ipv4_addr); + } else { + /* GIDs and IPv6 addresses coincide in location and size */ + for (i = 0; i < ARRAY_SIZE(qp->sgid.dwords); i++) { + src_gid[i] = cpu_to_le32(qp->sgid.dwords[i]); + dst_gid[i] = cpu_to_le32(qp->dgid.dwords[i]); + } + } +} + +static enum roce_flavor qed_roce_mode_to_flavor(enum roce_mode roce_mode) +{ + enum roce_flavor flavor; + + switch (roce_mode) { + case ROCE_V1: + flavor = PLAIN_ROCE; + break; + case ROCE_V2_IPV4: + flavor = RROCE_IPV4; + break; + case ROCE_V2_IPV6: + flavor = ROCE_V2_IPV6; + break; + default: + flavor = MAX_ROCE_MODE; + break; + } + return flavor; +} + +int qed_roce_alloc_cid(struct qed_hwfn *p_hwfn, u16 *cid) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + u32 responder_icid; + u32 requester_icid; + int rc; + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_rdma_info->cid_map, + &responder_icid); + if (rc) { + spin_unlock_bh(&p_rdma_info->lock); + return rc; + } + + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_rdma_info->cid_map, + &requester_icid); + + spin_unlock_bh(&p_rdma_info->lock); + if (rc) + goto err; + + /* the two icid's should be adjacent */ + if ((requester_icid - responder_icid) != 1) { + DP_NOTICE(p_hwfn, "Failed to allocate two adjacent qp's'\n"); + rc = -EINVAL; + goto err; + } + + responder_icid += qed_cxt_get_proto_cid_start(p_hwfn, + p_rdma_info->proto); + requester_icid += qed_cxt_get_proto_cid_start(p_hwfn, + p_rdma_info->proto); + + /* If these icids require a new ILT line allocate DMA-able context for + * an ILT page + */ + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, responder_icid); + if (rc) + goto err; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, requester_icid); + if (rc) + goto err; + + *cid = (u16)responder_icid; + return rc; + +err: + spin_lock_bh(&p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, responder_icid); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, requester_icid); + + spin_unlock_bh(&p_rdma_info->lock); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Allocate CID - failed, rc = %d\n", rc); + return rc; +} + +static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp) +{ + struct roce_create_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + union qed_qm_pq_params qm_params; + enum roce_flavor roce_flavor; + struct qed_spq_entry *p_ent; + u16 physical_queue0 = 0; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* Allocate DMA-able memory for IRQ */ + qp->irq_num_pages = 1; + qp->irq = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + RDMA_RING_PAGE_SIZE, + &qp->irq_phys_addr, GFP_KERNEL); + if (!qp->irq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed create responder failed: cannot allocate memory (irq). rc = %d\n", + rc); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_CREATE_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_create_qp_resp; + + p_ramrod->flags = 0; + + roce_flavor = qed_roce_mode_to_flavor(qp->roce_mode); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_ROCE_FLAVOR, roce_flavor); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_RD_EN, + qp->incoming_rdma_read_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_WR_EN, + qp->incoming_rdma_write_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_ATOMIC_EN, + qp->incoming_atomic_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN, + qp->e2e_flow_control_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_SRQ_FLG, qp->use_srq); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN, + qp->fmr_and_reserved_lkey); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER, + qp->min_rnr_nak_timer); + + p_ramrod->max_ird = qp->max_rd_atomic_resp; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->irq_num_pages = qp->irq_num_pages; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->dst_qp_id = cpu_to_le32(qp->dest_qp); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + p_ramrod->initial_psn = cpu_to_le32(qp->rq_psn); + p_ramrod->pd = cpu_to_le16(qp->pd); + p_ramrod->rq_num_pages = cpu_to_le16(qp->rq_num_pages); + DMA_REGPAIR_LE(p_ramrod->rq_pbl_addr, qp->rq_pbl_ptr); + DMA_REGPAIR_LE(p_ramrod->irq_pbl_addr, qp->irq_phys_addr); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + p_ramrod->qp_handle_for_async.hi = cpu_to_le32(qp->qp_handle_async.hi); + p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); + p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); + p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); + p_ramrod->stats_counter_id = p_hwfn->rel_pf_id; + p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | + qp->rq_cq_id); + + memset(&qm_params, 0, sizeof(qm_params)); + qm_params.roce.qpid = qp->icid >> 1; + physical_queue0 = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, &qm_params); + + p_ramrod->physical_queue0 = cpu_to_le16(physical_queue0); + p_ramrod->dpi = cpu_to_le16(qp->dpi); + + qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); + qed_rdma_set_fw_mac(p_ramrod->local_mac_addr, qp->local_mac_addr); + + p_ramrod->udp_src_port = qp->udp_src_port; + p_ramrod->vlan_id = cpu_to_le16(qp->vlan_id); + p_ramrod->srq_id.srq_idx = cpu_to_le16(qp->srq_id); + p_ramrod->srq_id.opaque_fid = cpu_to_le16(p_hwfn->hw_info.opaque_fid); + + p_ramrod->stats_counter_id = RESC_START(p_hwfn, QED_RDMA_STATS_QUEUE) + + qp->stats_queue; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d physical_queue0 = 0x%x\n", + rc, physical_queue0); + + if (rc) + goto err; + + qp->resp_offloaded = true; + + return rc; + +err: + DP_NOTICE(p_hwfn, "create responder - failed, rc = %d\n", rc); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->irq_num_pages * RDMA_RING_PAGE_SIZE, + qp->irq, qp->irq_phys_addr); + + return rc; +} + +static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp) +{ + struct roce_create_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + union qed_qm_pq_params qm_params; + enum roce_flavor roce_flavor; + struct qed_spq_entry *p_ent; + u16 physical_queue0 = 0; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* Allocate DMA-able memory for ORQ */ + qp->orq_num_pages = 1; + qp->orq = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + RDMA_RING_PAGE_SIZE, + &qp->orq_phys_addr, GFP_KERNEL); + if (!qp->orq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed create requester failed: cannot allocate memory (orq). rc = %d\n", + rc); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_RAMROD_CREATE_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_create_qp_req; + + p_ramrod->flags = 0; + + roce_flavor = qed_roce_mode_to_flavor(qp->roce_mode); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_ROCE_FLAVOR, roce_flavor); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_FMR_AND_RESERVED_EN, + qp->fmr_and_reserved_lkey); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_SIGNALED_COMP, qp->signal_all); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT, qp->retry_cnt); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_RNR_NAK_CNT, + qp->rnr_retry_cnt); + + p_ramrod->max_ord = qp->max_rd_atomic_req; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->orq_num_pages = qp->orq_num_pages; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->dst_qp_id = cpu_to_le32(qp->dest_qp); + p_ramrod->ack_timeout_val = cpu_to_le32(qp->ack_timeout); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + p_ramrod->initial_psn = cpu_to_le32(qp->sq_psn); + p_ramrod->pd = cpu_to_le16(qp->pd); + p_ramrod->sq_num_pages = cpu_to_le16(qp->sq_num_pages); + DMA_REGPAIR_LE(p_ramrod->sq_pbl_addr, qp->sq_pbl_ptr); + DMA_REGPAIR_LE(p_ramrod->orq_pbl_addr, qp->orq_phys_addr); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + p_ramrod->qp_handle_for_async.hi = cpu_to_le32(qp->qp_handle_async.hi); + p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); + p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); + p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); + p_ramrod->stats_counter_id = p_hwfn->rel_pf_id; + p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | + qp->sq_cq_id); + + memset(&qm_params, 0, sizeof(qm_params)); + qm_params.roce.qpid = qp->icid >> 1; + physical_queue0 = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, &qm_params); + + p_ramrod->physical_queue0 = cpu_to_le16(physical_queue0); + p_ramrod->dpi = cpu_to_le16(qp->dpi); + + qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); + qed_rdma_set_fw_mac(p_ramrod->local_mac_addr, qp->local_mac_addr); + + p_ramrod->udp_src_port = qp->udp_src_port; + p_ramrod->vlan_id = cpu_to_le16(qp->vlan_id); + p_ramrod->stats_counter_id = RESC_START(p_hwfn, QED_RDMA_STATS_QUEUE) + + qp->stats_queue; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + + if (rc) + goto err; + + qp->req_offloaded = true; + + return rc; + +err: + DP_NOTICE(p_hwfn, "Create requested - failed, rc = %d\n", rc); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->orq_num_pages * RDMA_RING_PAGE_SIZE, + qp->orq, qp->orq_phys_addr); + return rc; +} + +static int qed_roce_sp_modify_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + bool move_to_err, u32 modify_flags) +{ + struct roce_modify_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (move_to_err && !qp->resp_offloaded) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_EVENT_MODIFY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) { + DP_NOTICE(p_hwfn, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.roce_modify_qp_resp; + + p_ramrod->flags = 0; + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MOVE_TO_ERR_FLG, move_to_err); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_RD_EN, + qp->incoming_rdma_read_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_WR_EN, + qp->incoming_rdma_write_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_ATOMIC_EN, + qp->incoming_atomic_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN, + qp->e2e_flow_control_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_OPS_EN_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_P_KEY_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_ADDRESS_VECTOR_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MAX_IRD_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER)); + + p_ramrod->fields = 0; + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER, + qp->min_rnr_nak_timer); + + p_ramrod->max_ird = qp->max_rd_atomic_resp; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify responder, rc = %d\n", rc); + return rc; +} + +static int qed_roce_sp_modify_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + bool move_to_sqd, + bool move_to_err, u32 modify_flags) +{ + struct roce_modify_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (move_to_err && !(qp->req_offloaded)) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_EVENT_MODIFY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) { + DP_NOTICE(p_hwfn, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.roce_modify_qp_req; + + p_ramrod->flags = 0; + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_ERR_FLG, move_to_err); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_SQD_FLG, move_to_sqd); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_EN_SQD_ASYNC_NOTIFY, + qp->sqd_async); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_P_KEY_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ADDRESS_VECTOR_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MAX_ORD_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_RETRY_CNT)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ACK_TIMEOUT_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT)); + + p_ramrod->fields = 0; + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT, qp->retry_cnt); + + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT, + qp->rnr_retry_cnt); + + p_ramrod->max_ord = qp->max_rd_atomic_req; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->ack_timeout_val = cpu_to_le32(qp->ack_timeout); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify requester, rc = %d\n", rc); + return rc; +} + +static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + u32 *num_invalidated_mw) +{ + struct roce_destroy_qp_resp_output_params *p_ramrod_res; + struct roce_destroy_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (!qp->resp_offloaded) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_RAMROD_DESTROY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.roce_destroy_qp_resp; + + p_ramrod_res = (struct roce_destroy_qp_resp_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_ramrod_res), + &ramrod_res_phys, GFP_KERNEL); + + if (!p_ramrod_res) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed destroy responder failed: cannot allocate memory (ramrod). rc = %d\n", + rc); + return rc; + } + + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + *num_invalidated_mw = le32_to_cpu(p_ramrod_res->num_invalidated_mw); + + /* Free IRQ - only if ramrod succeeded, in case FW is still using it */ + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->irq_num_pages * RDMA_RING_PAGE_SIZE, + qp->irq, qp->irq_phys_addr); + + qp->resp_offloaded = false; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Destroy responder, rc = %d\n", rc); + +err: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct roce_destroy_qp_resp_output_params), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + +static int qed_roce_sp_destroy_qp_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + u32 *num_bound_mw) +{ + struct roce_destroy_qp_req_output_params *p_ramrod_res; + struct roce_destroy_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (!qp->req_offloaded) + return 0; + + p_ramrod_res = (struct roce_destroy_qp_req_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_ramrod_res), + &ramrod_res_phys, GFP_KERNEL); + if (!p_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed destroy requester failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_DESTROY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_destroy_qp_req; + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + *num_bound_mw = le32_to_cpu(p_ramrod_res->num_bound_mw); + + /* Free ORQ - only if ramrod succeeded, in case FW is still using it */ + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->orq_num_pages * RDMA_RING_PAGE_SIZE, + qp->orq, qp->orq_phys_addr); + + qp->req_offloaded = false; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Destroy requester, rc = %d\n", rc); + +err: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_ramrod_res), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + +int qed_roce_query_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) +{ + struct roce_query_qp_resp_output_params *p_resp_ramrod_res; + struct roce_query_qp_req_output_params *p_req_ramrod_res; + struct roce_query_qp_resp_ramrod_data *p_resp_ramrod; + struct roce_query_qp_req_ramrod_data *p_req_ramrod; + struct qed_sp_init_data init_data; + dma_addr_t resp_ramrod_res_phys; + dma_addr_t req_ramrod_res_phys; + struct qed_spq_entry *p_ent; + bool rq_err_state; + bool sq_err_state; + bool sq_draining; + int rc = -ENOMEM; + + if ((!(qp->resp_offloaded)) && (!(qp->req_offloaded))) { + /* We can't send ramrod to the fw since this qp wasn't offloaded + * to the fw yet + */ + out_params->draining = false; + out_params->rq_psn = qp->rq_psn; + out_params->sq_psn = qp->sq_psn; + out_params->state = qp->cur_state; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "No QPs as no offload\n"); + return 0; + } + + if (!(qp->resp_offloaded)) { + DP_NOTICE(p_hwfn, + "The responder's qp should be offloded before requester's\n"); + return -EINVAL; + } + + /* Send a query responder ramrod to FW to get RQ-PSN and state */ + p_resp_ramrod_res = (struct roce_query_qp_resp_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_resp_ramrod_res), + &resp_ramrod_res_phys, GFP_KERNEL); + if (!p_resp_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed query qp failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_QUERY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err_resp; + + p_resp_ramrod = &p_ent->ramrod.roce_query_qp_resp; + DMA_REGPAIR_LE(p_resp_ramrod->output_params_addr, resp_ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err_resp; + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_resp_ramrod_res), + p_resp_ramrod_res, resp_ramrod_res_phys); + + out_params->rq_psn = le32_to_cpu(p_resp_ramrod_res->psn); + rq_err_state = GET_FIELD(le32_to_cpu(p_resp_ramrod_res->err_flag), + ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_ERROR_FLG); + + if (!(qp->req_offloaded)) { + /* Don't send query qp for the requester */ + out_params->sq_psn = qp->sq_psn; + out_params->draining = false; + + if (rq_err_state) + qp->cur_state = QED_ROCE_QP_STATE_ERR; + + out_params->state = qp->cur_state; + + return 0; + } + + /* Send a query requester ramrod to FW to get SQ-PSN and state */ + p_req_ramrod_res = (struct roce_query_qp_req_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_req_ramrod_res), + &req_ramrod_res_phys, + GFP_KERNEL); + if (!p_req_ramrod_res) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed query qp failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + init_data.cid = qp->icid + 1; + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_QUERY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err_req; + + p_req_ramrod = &p_ent->ramrod.roce_query_qp_req; + DMA_REGPAIR_LE(p_req_ramrod->output_params_addr, req_ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err_req; + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_req_ramrod_res), + p_req_ramrod_res, req_ramrod_res_phys); + + out_params->sq_psn = le32_to_cpu(p_req_ramrod_res->psn); + sq_err_state = GET_FIELD(le32_to_cpu(p_req_ramrod_res->flags), + ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_ERR_FLG); + sq_draining = + GET_FIELD(le32_to_cpu(p_req_ramrod_res->flags), + ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_SQ_DRAINING_FLG); + + out_params->draining = false; + + if (rq_err_state) + qp->cur_state = QED_ROCE_QP_STATE_ERR; + else if (sq_err_state) + qp->cur_state = QED_ROCE_QP_STATE_SQE; + else if (sq_draining) + out_params->draining = true; + out_params->state = qp->cur_state; + + return 0; + +err_req: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_req_ramrod_res), + p_req_ramrod_res, req_ramrod_res_phys); + return rc; +err_resp: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_resp_ramrod_res), + p_resp_ramrod_res, resp_ramrod_res_phys); + return rc; +} + +int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) +{ + u32 num_invalidated_mw = 0; + u32 num_bound_mw = 0; + u32 start_cid; + int rc; + + /* Destroys the specified QP */ + if ((qp->cur_state != QED_ROCE_QP_STATE_RESET) && + (qp->cur_state != QED_ROCE_QP_STATE_ERR) && + (qp->cur_state != QED_ROCE_QP_STATE_INIT)) { + DP_NOTICE(p_hwfn, + "QP must be in error, reset or init state before destroying it\n"); + return -EINVAL; + } + + rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp, &num_invalidated_mw); + if (rc) + return rc; + + /* Send destroy requester ramrod */ + rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp, &num_bound_mw); + if (rc) + return rc; + + if (num_invalidated_mw != num_bound_mw) { + DP_NOTICE(p_hwfn, + "number of invalidate memory windows is different from bounded ones\n"); + return -EINVAL; + } + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + + start_cid = qed_cxt_get_proto_cid_start(p_hwfn, + p_hwfn->p_rdma_info->proto); + + /* Release responder's icid */ + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, + qp->icid - start_cid); + + /* Release requester's icid */ + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, + qp->icid + 1 - start_cid); + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + return 0; +} + +int qed_rdma_query_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* The following fields are filled in from qp and not FW as they can't + * be modified by FW + */ + out_params->mtu = qp->mtu; + out_params->dest_qp = qp->dest_qp; + out_params->incoming_atomic_en = qp->incoming_atomic_en; + out_params->e2e_flow_control_en = qp->e2e_flow_control_en; + out_params->incoming_rdma_read_en = qp->incoming_rdma_read_en; + out_params->incoming_rdma_write_en = qp->incoming_rdma_write_en; + out_params->dgid = qp->dgid; + out_params->flow_label = qp->flow_label; + out_params->hop_limit_ttl = qp->hop_limit_ttl; + out_params->traffic_class_tos = qp->traffic_class_tos; + out_params->timeout = qp->ack_timeout; + out_params->rnr_retry = qp->rnr_retry_cnt; + out_params->retry_cnt = qp->retry_cnt; + out_params->min_rnr_nak_timer = qp->min_rnr_nak_timer; + out_params->pkey_index = 0; + out_params->max_rd_atomic = qp->max_rd_atomic_req; + out_params->max_dest_rd_atomic = qp->max_rd_atomic_resp; + out_params->sqd_async = qp->sqd_async; + + rc = qed_roce_query_qp(p_hwfn, qp, out_params); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Query QP, rc = %d\n", rc); + return rc; +} + +int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + rc = qed_roce_destroy_qp(p_hwfn, qp); + + /* free qp params struct */ + kfree(qp); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP destroyed\n"); + return rc; +} + +struct qed_rdma_qp * +qed_rdma_create_qp(void *rdma_cxt, + struct qed_rdma_create_qp_in_params *in_params, + struct qed_rdma_create_qp_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_qp *qp; + u8 max_stats_queues; + int rc; + + if (!rdma_cxt || !in_params || !out_params || !p_hwfn->p_rdma_info) { + DP_ERR(p_hwfn->cdev, + "qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", + rdma_cxt, in_params, out_params); + return NULL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "qed rdma create qp called with qp_handle = %08x%08x\n", + in_params->qp_handle_hi, in_params->qp_handle_lo); + + /* Some sanity checks... */ + max_stats_queues = p_hwfn->p_rdma_info->dev->max_stats_queues; + if (in_params->stats_queue >= max_stats_queues) { + DP_ERR(p_hwfn->cdev, + "qed rdma create qp failed due to invalid statistics queue %d. maximum is %d\n", + in_params->stats_queue, max_stats_queues); + return NULL; + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + DP_NOTICE(p_hwfn, "Failed to allocate qed_rdma_qp\n"); + return NULL; + } + + rc = qed_roce_alloc_cid(p_hwfn, &qp->icid); + qp->qpid = ((0xFF << 16) | qp->icid); + + DP_INFO(p_hwfn, "ROCE qpid=%x\n", qp->qpid); + + if (rc) { + kfree(qp); + return NULL; + } + + qp->cur_state = QED_ROCE_QP_STATE_RESET; + qp->qp_handle.hi = cpu_to_le32(in_params->qp_handle_hi); + qp->qp_handle.lo = cpu_to_le32(in_params->qp_handle_lo); + qp->qp_handle_async.hi = cpu_to_le32(in_params->qp_handle_async_hi); + qp->qp_handle_async.lo = cpu_to_le32(in_params->qp_handle_async_lo); + qp->use_srq = in_params->use_srq; + qp->signal_all = in_params->signal_all; + qp->fmr_and_reserved_lkey = in_params->fmr_and_reserved_lkey; + qp->pd = in_params->pd; + qp->dpi = in_params->dpi; + qp->sq_cq_id = in_params->sq_cq_id; + qp->sq_num_pages = in_params->sq_num_pages; + qp->sq_pbl_ptr = in_params->sq_pbl_ptr; + qp->rq_cq_id = in_params->rq_cq_id; + qp->rq_num_pages = in_params->rq_num_pages; + qp->rq_pbl_ptr = in_params->rq_pbl_ptr; + qp->srq_id = in_params->srq_id; + qp->req_offloaded = false; + qp->resp_offloaded = false; + qp->e2e_flow_control_en = qp->use_srq ? false : true; + qp->stats_queue = in_params->stats_queue; + + out_params->icid = qp->icid; + out_params->qp_id = qp->qpid; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Create QP, rc = %d\n", rc); + return qp; +} + +static int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + enum qed_roce_qp_state prev_state, + struct qed_rdma_modify_qp_in_params *params) +{ + u32 num_invalidated_mw = 0, num_bound_mw = 0; + int rc = 0; + + /* Perform additional operations according to the current state and the + * next state + */ + if (((prev_state == QED_ROCE_QP_STATE_INIT) || + (prev_state == QED_ROCE_QP_STATE_RESET)) && + (qp->cur_state == QED_ROCE_QP_STATE_RTR)) { + /* Init->RTR or Reset->RTR */ + rc = qed_roce_sp_create_responder(p_hwfn, qp); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTR) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* RTR-> RTS */ + rc = qed_roce_sp_create_requester(p_hwfn, qp); + if (rc) + return rc; + + /* Send modify responder ramrod */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTS) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* RTS->RTS */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTS) && + (qp->cur_state == QED_ROCE_QP_STATE_SQD)) { + /* RTS->SQD */ + rc = qed_roce_sp_modify_requester(p_hwfn, qp, true, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_SQD) && + (qp->cur_state == QED_ROCE_QP_STATE_SQD)) { + /* SQD->SQD */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_SQD) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* SQD->RTS */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + + return rc; + } else if (qp->cur_state == QED_ROCE_QP_STATE_ERR || + qp->cur_state == QED_ROCE_QP_STATE_SQE) { + /* ->ERR */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, true, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, true, + params->modify_flags); + return rc; + } else if (qp->cur_state == QED_ROCE_QP_STATE_RESET) { + /* Any state -> RESET */ + + rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp, + &num_invalidated_mw); + if (rc) + return rc; + + rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp, + &num_bound_mw); + + if (num_invalidated_mw != num_bound_mw) { + DP_NOTICE(p_hwfn, + "number of invalidate memory windows is different from bounded ones\n"); + return -EINVAL; + } + } else { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "0\n"); + } + + return rc; +} + +int qed_rdma_modify_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + enum qed_roce_qp_state prev_state; + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x params->new_state=%d\n", + qp->icid, params->new_state); + + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN)) { + qp->incoming_rdma_read_en = params->incoming_rdma_read_en; + qp->incoming_rdma_write_en = params->incoming_rdma_write_en; + qp->incoming_atomic_en = params->incoming_atomic_en; + } + + /* Update QP structure with the updated values */ + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_ROCE_MODE)) + qp->roce_mode = params->roce_mode; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)) + qp->pkey = params->pkey; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN)) + qp->e2e_flow_control_en = params->e2e_flow_control_en; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_DEST_QP)) + qp->dest_qp = params->dest_qp; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)) { + /* Indicates that the following parameters have changed: + * Traffic class, flow label, hop limit, source GID, + * destination GID, loopback indicator + */ + qp->traffic_class_tos = params->traffic_class_tos; + qp->flow_label = params->flow_label; + qp->hop_limit_ttl = params->hop_limit_ttl; + + qp->sgid = params->sgid; + qp->dgid = params->dgid; + qp->udp_src_port = 0; + qp->vlan_id = params->vlan_id; + qp->mtu = params->mtu; + qp->lb_indication = params->lb_indication; + memcpy((u8 *)&qp->remote_mac_addr[0], + (u8 *)¶ms->remote_mac_addr[0], ETH_ALEN); + if (params->use_local_mac) { + memcpy((u8 *)&qp->local_mac_addr[0], + (u8 *)¶ms->local_mac_addr[0], ETH_ALEN); + } else { + memcpy((u8 *)&qp->local_mac_addr[0], + (u8 *)&p_hwfn->hw_info.hw_mac_addr, ETH_ALEN); + } + } + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_RQ_PSN)) + qp->rq_psn = params->rq_psn; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_SQ_PSN)) + qp->sq_psn = params->sq_psn; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ)) + qp->max_rd_atomic_req = params->max_rd_atomic_req; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP)) + qp->max_rd_atomic_resp = params->max_rd_atomic_resp; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT)) + qp->ack_timeout = params->ack_timeout; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_RETRY_CNT)) + qp->retry_cnt = params->retry_cnt; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT)) + qp->rnr_retry_cnt = params->rnr_retry_cnt; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER)) + qp->min_rnr_nak_timer = params->min_rnr_nak_timer; + + qp->sqd_async = params->sqd_async; + + prev_state = qp->cur_state; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_NEW_STATE)) { + qp->cur_state = params->new_state; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "qp->cur_state=%d\n", + qp->cur_state); + } + + rc = qed_roce_modify_qp(p_hwfn, qp, prev_state, params); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify QP, rc = %d\n", rc); + return rc; +} + static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) { return QED_LEADING_HWFN(cdev); @@ -1201,6 +2394,10 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = { .rdma_dealloc_pd = &qed_rdma_free_pd, .rdma_create_cq = &qed_rdma_create_cq, .rdma_destroy_cq = &qed_rdma_destroy_cq, + .rdma_create_qp = &qed_rdma_create_qp, + .rdma_modify_qp = &qed_rdma_modify_qp, + .rdma_query_qp = &qed_rdma_query_qp, + .rdma_destroy_qp = &qed_rdma_destroy_qp, }; const struct qed_rdma_ops *qed_get_rdma_ops() diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index 1fe73707e0b5..b8ddda456101 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -114,6 +114,72 @@ struct qed_rdma_resize_cnq_in_params { u64 pbl_ptr; }; +struct qed_rdma_qp { + struct regpair qp_handle; + struct regpair qp_handle_async; + u32 qpid; + u16 icid; + enum qed_roce_qp_state cur_state; + bool use_srq; + bool signal_all; + bool fmr_and_reserved_lkey; + + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + + u16 pd; + u16 pkey; + u32 dest_qp; + u16 mtu; + u16 srq_id; + u8 traffic_class_tos; + u8 hop_limit_ttl; + u16 dpi; + u32 flow_label; + bool lb_indication; + u16 vlan_id; + u32 ack_timeout; + u8 retry_cnt; + u8 rnr_retry_cnt; + u8 min_rnr_nak_timer; + bool sqd_async; + union qed_gid sgid; + union qed_gid dgid; + enum roce_mode roce_mode; + u16 udp_src_port; + u8 stats_queue; + + /* requeseter */ + u8 max_rd_atomic_req; + u32 sq_psn; + u16 sq_cq_id; + u16 sq_num_pages; + dma_addr_t sq_pbl_ptr; + void *orq; + dma_addr_t orq_phys_addr; + u8 orq_num_pages; + bool req_offloaded; + + /* responder */ + u8 max_rd_atomic_resp; + u32 rq_psn; + u16 rq_cq_id; + u16 rq_num_pages; + dma_addr_t rq_pbl_ptr; + void *irq; + dma_addr_t irq_phys_addr; + u8 irq_num_pages; + bool resp_offloaded; + + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + + void *shared_queue; + dma_addr_t shared_queue_phys_addr; +}; + int qed_rdma_add_user(void *rdma_cxt, struct qed_rdma_add_user_out_params *out_params); @@ -135,6 +201,11 @@ void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 cnq_index, u16 prod); void qed_rdma_resc_free(struct qed_hwfn *p_hwfn); void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe); +int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp); +int qed_rdma_modify_qp(void *rdma_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *params); +int qed_rdma_query_qp(void *rdma_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params); #if IS_ENABLED(CONFIG_INFINIBAND_QEDR) void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index b559b1c9e76d..02321e3b1716 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -43,6 +43,17 @@ #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) /* rdma interface */ + +enum qed_roce_qp_state { + QED_ROCE_QP_STATE_RESET, + QED_ROCE_QP_STATE_INIT, + QED_ROCE_QP_STATE_RTR, + QED_ROCE_QP_STATE_RTS, + QED_ROCE_QP_STATE_SQD, + QED_ROCE_QP_STATE_ERR, + QED_ROCE_QP_STATE_SQE +}; + enum qed_rdma_tid_type { QED_RDMA_TID_REGISTERED_MR, QED_RDMA_TID_FMR, @@ -292,6 +303,128 @@ struct qed_rdma_destroy_cq_out_params { u16 num_cq_notif; }; +struct qed_rdma_create_qp_in_params { + u32 qp_handle_lo; + u32 qp_handle_hi; + u32 qp_handle_async_lo; + u32 qp_handle_async_hi; + bool use_srq; + bool signal_all; + bool fmr_and_reserved_lkey; + u16 pd; + u16 dpi; + u16 sq_cq_id; + u16 sq_num_pages; + u64 sq_pbl_ptr; + u8 max_sq_sges; + u16 rq_cq_id; + u16 rq_num_pages; + u64 rq_pbl_ptr; + u16 srq_id; + u8 stats_queue; +}; + +struct qed_rdma_create_qp_out_params { + u32 qp_id; + u16 icid; + void *rq_pbl_virt; + dma_addr_t rq_pbl_phys; + void *sq_pbl_virt; + dma_addr_t sq_pbl_phys; +}; + +struct qed_rdma_modify_qp_in_params { + u32 modify_flags; +#define QED_RDMA_MODIFY_QP_VALID_NEW_STATE_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_NEW_STATE_SHIFT 0 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_SHIFT 1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_SHIFT 2 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_SHIFT 3 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_SHIFT 4 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_SHIFT 5 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_SHIFT 6 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_SHIFT 7 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_SHIFT 8 +#define QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_SHIFT 9 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_SHIFT 10 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_SHIFT 11 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_SHIFT 12 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_SHIFT 13 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_SHIFT 14 + + enum qed_roce_qp_state new_state; + u16 pkey; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + u32 dest_qp; + bool lb_indication; + u16 mtu; + u8 traffic_class_tos; + u8 hop_limit_ttl; + u32 flow_label; + union qed_gid sgid; + union qed_gid dgid; + u16 udp_src_port; + + u16 vlan_id; + + u32 rq_psn; + u32 sq_psn; + u8 max_rd_atomic_resp; + u8 max_rd_atomic_req; + u32 ack_timeout; + u8 retry_cnt; + u8 rnr_retry_cnt; + u8 min_rnr_nak_timer; + bool sqd_async; + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + bool use_local_mac; + enum roce_mode roce_mode; +}; + +struct qed_rdma_query_qp_out_params { + enum qed_roce_qp_state state; + u32 rq_psn; + u32 sq_psn; + bool draining; + u16 mtu; + u32 dest_qp; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + union qed_gid sgid; + union qed_gid dgid; + u32 flow_label; + u8 hop_limit_ttl; + u8 traffic_class_tos; + u32 timeout; + u8 rnr_retry; + u8 retry_cnt; + u8 min_rnr_nak_timer; + u16 pkey_index; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + bool sqd_async; +}; + struct qed_rdma_create_srq_out_params { u16 srq_id; }; @@ -368,6 +501,17 @@ struct qed_rdma_ops { int (*rdma_destroy_cq)(void *rdma_cxt, struct qed_rdma_destroy_cq_in_params *iparams, struct qed_rdma_destroy_cq_out_params *oparams); + struct qed_rdma_qp * + (*rdma_create_qp)(void *rdma_cxt, + struct qed_rdma_create_qp_in_params *iparams, + struct qed_rdma_create_qp_out_params *oparams); + + int (*rdma_modify_qp)(void *roce_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *iparams); + + int (*rdma_query_qp)(void *rdma_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *oparams); + int (*rdma_destroy_qp)(void *rdma_cxt, struct qed_rdma_qp *qp); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From ee8eaea30b1368680f4d2f873bc14e1d7b57d021 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 22:00:00 +0300 Subject: qed: Add support for memory registeration verbs Add slowpath configuration support for user, dma and memory regions registration. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_roce.c | 240 +++++++++++++++++++++++++++++ include/linux/qed/qed_roce_if.h | 6 + 2 files changed, 246 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 438a0badc4e1..8b4854d3b395 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -689,6 +689,17 @@ struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) return p_hwfn->p_rdma_info->dev; } +void qed_rdma_free_tid(void *rdma_cxt, u32 itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -2300,6 +2311,231 @@ int qed_rdma_modify_qp(void *rdma_cxt, return rc; } +int qed_rdma_register_tid(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_register_tid_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + enum rdma_tid_type tid_type; + u8 fw_return_code; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", params->itid); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_REGISTER_MR, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (p_hwfn->p_rdma_info->last_tid < params->itid) + p_hwfn->p_rdma_info->last_tid = params->itid; + + p_ramrod = &p_ent->ramrod.rdma_register_tid; + + p_ramrod->flags = 0; + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_TWO_LEVEL_PBL, + params->pbl_two_level); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_ZERO_BASED, params->zbva); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_PHY_MR, params->phy_mr); + + /* Don't initialize D/C field, as it may override other bits. */ + if (!(params->tid_type == QED_RDMA_TID_FMR) && !(params->dma_mr)) + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_PAGE_SIZE_LOG, + params->page_size_log - 12); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_MAX_ID, + p_hwfn->p_rdma_info->last_tid); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_READ, + params->remote_read); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_WRITE, + params->remote_write); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_ATOMIC, + params->remote_atomic); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_WRITE, + params->local_write); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_READ, params->local_read); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_ENABLE_MW_BIND, + params->mw_bind); + + SET_FIELD(p_ramrod->flags1, + RDMA_REGISTER_TID_RAMROD_DATA_PBL_PAGE_SIZE_LOG, + params->pbl_page_size_log - 12); + + SET_FIELD(p_ramrod->flags2, + RDMA_REGISTER_TID_RAMROD_DATA_DMA_MR, params->dma_mr); + + switch (params->tid_type) { + case QED_RDMA_TID_REGISTERED_MR: + tid_type = RDMA_TID_REGISTERED_MR; + break; + case QED_RDMA_TID_FMR: + tid_type = RDMA_TID_FMR; + break; + case QED_RDMA_TID_MW_TYPE1: + tid_type = RDMA_TID_MW_TYPE1; + break; + case QED_RDMA_TID_MW_TYPE2A: + tid_type = RDMA_TID_MW_TYPE2A; + break; + default: + rc = -EINVAL; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + SET_FIELD(p_ramrod->flags1, + RDMA_REGISTER_TID_RAMROD_DATA_TID_TYPE, tid_type); + + p_ramrod->itid = cpu_to_le32(params->itid); + p_ramrod->key = params->key; + p_ramrod->pd = cpu_to_le16(params->pd); + p_ramrod->length_hi = (u8)(params->length >> 32); + p_ramrod->length_lo = DMA_LO_LE(params->length); + if (params->zbva) { + /* Lower 32 bits of the registered MR address. + * In case of zero based MR, will hold FBO + */ + p_ramrod->va.hi = 0; + p_ramrod->va.lo = cpu_to_le32(params->fbo); + } else { + DMA_REGPAIR_LE(p_ramrod->va, params->vaddr); + } + DMA_REGPAIR_LE(p_ramrod->pbl_base, params->pbl_ptr); + + /* DIF */ + if (params->dif_enabled) { + SET_FIELD(p_ramrod->flags2, + RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG, 1); + DMA_REGPAIR_LE(p_ramrod->dif_error_addr, + params->dif_error_addr); + DMA_REGPAIR_LE(p_ramrod->dif_runt_addr, params->dif_runt_addr); + } + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + + if (fw_return_code != RDMA_RETURN_OK) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Register TID, rc = %d\n", rc); + return rc; +} + +int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_deregister_tid_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + struct qed_ptt *p_ptt; + u8 fw_return_code; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_DEREGISTER_MR, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.rdma_deregister_tid; + p_ramrod->itid = cpu_to_le32(itid); + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (fw_return_code == RDMA_RETURN_DEREGISTER_MR_BAD_STATE_ERR) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code); + return -EINVAL; + } else if (fw_return_code == RDMA_RETURN_NIG_DRAIN_REQ) { + /* Bit indicating that the TID is in use and a nig drain is + * required before sending the ramrod again + */ + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + rc = -EBUSY; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to acquire PTT\n"); + return rc; + } + + rc = qed_mcp_drain(p_hwfn, p_ptt); + if (rc) { + qed_ptt_release(p_hwfn, p_ptt); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Drain failed\n"); + return rc; + } + + qed_ptt_release(p_hwfn, p_ptt); + + /* Resend the ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_DEREGISTER_MR, + p_hwfn->p_rdma_info->proto, + &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to init sp-element\n"); + return rc; + } + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Ramrod failed\n"); + return rc; + } + + if (fw_return_code != RDMA_RETURN_OK) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", + fw_return_code); + return rc; + } + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "De-registered TID, rc = %d\n", rc); + return rc; +} + static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) { return QED_LEADING_HWFN(cdev); @@ -2398,6 +2634,10 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = { .rdma_modify_qp = &qed_rdma_modify_qp, .rdma_query_qp = &qed_rdma_query_qp, .rdma_destroy_qp = &qed_rdma_destroy_qp, + .rdma_alloc_tid = &qed_rdma_alloc_tid, + .rdma_free_tid = &qed_rdma_free_tid, + .rdma_register_tid = &qed_rdma_register_tid, + .rdma_deregister_tid = &qed_rdma_deregister_tid, }; const struct qed_rdma_ops *qed_get_rdma_ops() diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 02321e3b1716..0b6df6eedcf1 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -512,6 +512,12 @@ struct qed_rdma_ops { int (*rdma_query_qp)(void *rdma_cxt, struct qed_rdma_qp *qp, struct qed_rdma_query_qp_out_params *oparams); int (*rdma_destroy_qp)(void *rdma_cxt, struct qed_rdma_qp *qp); + int + (*rdma_register_tid)(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *iparams); + int (*rdma_deregister_tid)(void *rdma_cxt, u32 itid); + int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid); + void (*rdma_free_tid)(void *rdma_cxt, u32 itid); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From abd49676c70793ee0a251bc3d8fe1604f9303210 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 22:00:01 +0300 Subject: qed: Add RoCE ll2 & GSI support Add the RoCE-specific LL2 logic [as well as GSI support] over the 'generic' LL2 interface. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 1 + drivers/net/ethernet/qlogic/qed/qed_hsi.h | 3 + drivers/net/ethernet/qlogic/qed/qed_ll2.c | 115 +++++++++-- drivers/net/ethernet/qlogic/qed/qed_ll2.h | 29 ++- drivers/net/ethernet/qlogic/qed/qed_roce.c | 307 +++++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_roce.h | 1 + include/linux/qed/qed_roce_if.h | 79 ++++++++ 7 files changed, 523 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index c5a098a2ca2c..653bb5735f0c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -437,6 +437,7 @@ struct qed_hwfn { #endif struct z_stream_s *stream; + struct qed_roce_ll2_info *ll2; }; struct pci_params { diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 2777d5bb4380..72eee29c677f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -727,6 +727,9 @@ struct core_tx_bd_flags { #define CORE_TX_BD_FLAGS_L4_PROTOCOL_SHIFT 6 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_MASK 0x1 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_SHIFT 7 +#define CORE_TX_BD_FLAGS_ROCE_FLAV_MASK 0x1 +#define CORE_TX_BD_FLAGS_ROCE_FLAV_SHIFT 12 + }; struct core_tx_bd { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index e0ec8ed2f92c..a6db10717d5c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -278,6 +278,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) struct qed_ll2_tx_packet *p_pkt = NULL; struct qed_ll2_info *p_ll2_conn; struct qed_ll2_tx_queue *p_tx; + dma_addr_t tx_frag; p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle); if (!p_ll2_conn) @@ -297,11 +298,22 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) p_tx->cur_completing_packet = *p_pkt; p_tx->cur_completing_bd_idx = 1; b_last_frag = p_tx->cur_completing_bd_idx == p_pkt->bd_used; + tx_frag = p_pkt->bds_set[0].tx_frag; + if (p_ll2_conn->gsi_enable) + qed_ll2b_release_tx_gsi_packet(p_hwfn, + p_ll2_conn->my_id, + p_pkt->cookie, + tx_frag, + b_last_frag, + b_last_packet); + else + qed_ll2b_complete_tx_packet(p_hwfn, + p_ll2_conn->my_id, + p_pkt->cookie, + tx_frag, + b_last_frag, + b_last_packet); - qed_ll2b_complete_tx_packet(p_hwfn, p_ll2_conn->my_id, - p_pkt->cookie, - p_pkt->bds_set[0].tx_frag, - b_last_frag, b_last_packet); } } @@ -313,6 +325,7 @@ static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) struct qed_ll2_tx_packet *p_pkt; bool b_last_frag = false; unsigned long flags; + dma_addr_t tx_frag; int rc = -EINVAL; spin_lock_irqsave(&p_tx->lock, flags); @@ -353,11 +366,19 @@ static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); spin_unlock_irqrestore(&p_tx->lock, flags); - qed_ll2b_complete_tx_packet(p_hwfn, - p_ll2_conn->my_id, - p_pkt->cookie, - p_pkt->bds_set[0].tx_frag, - b_last_frag, !num_bds); + tx_frag = p_pkt->bds_set[0].tx_frag; + if (p_ll2_conn->gsi_enable) + qed_ll2b_complete_tx_gsi_packet(p_hwfn, + p_ll2_conn->my_id, + p_pkt->cookie, + tx_frag, + b_last_frag, !num_bds); + else + qed_ll2b_complete_tx_packet(p_hwfn, + p_ll2_conn->my_id, + p_pkt->cookie, + tx_frag, + b_last_frag, !num_bds); spin_lock_irqsave(&p_tx->lock, flags); } @@ -368,6 +389,54 @@ out: return rc; } +static int +qed_ll2_rxq_completion_gsi(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info, + union core_rx_cqe_union *p_cqe, + unsigned long lock_flags, bool b_last_cqe) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_info->rx_queue; + struct qed_ll2_rx_packet *p_pkt = NULL; + u16 packet_length, parse_flags, vlan; + u32 src_mac_addrhi; + u16 src_mac_addrlo; + + if (!list_empty(&p_rx->active_descq)) + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + if (!p_pkt) { + DP_NOTICE(p_hwfn, + "GSI Rx completion but active_descq is empty\n"); + return -EIO; + } + + list_del(&p_pkt->list_entry); + parse_flags = le16_to_cpu(p_cqe->rx_cqe_gsi.parse_flags.flags); + packet_length = le16_to_cpu(p_cqe->rx_cqe_gsi.data_length); + vlan = le16_to_cpu(p_cqe->rx_cqe_gsi.vlan); + src_mac_addrhi = le32_to_cpu(p_cqe->rx_cqe_gsi.src_mac_addrhi); + src_mac_addrlo = le16_to_cpu(p_cqe->rx_cqe_gsi.src_mac_addrlo); + if (qed_chain_consume(&p_rx->rxq_chain) != p_pkt->rxq_bd) + DP_NOTICE(p_hwfn, + "Mismatch between active_descq and the LL2 Rx chain\n"); + list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); + + spin_unlock_irqrestore(&p_rx->lock, lock_flags); + qed_ll2b_complete_rx_gsi_packet(p_hwfn, + p_ll2_info->my_id, + p_pkt->cookie, + p_pkt->rx_buf_addr, + packet_length, + p_cqe->rx_cqe_gsi.data_length_error, + parse_flags, + vlan, + src_mac_addrhi, + src_mac_addrlo, b_last_cqe); + spin_lock_irqsave(&p_rx->lock, lock_flags); + + return 0; +} + static int qed_ll2_rxq_completion_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_conn, union core_rx_cqe_union *p_cqe, @@ -429,6 +498,10 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n"); rc = -EINVAL; break; + case CORE_RX_CQE_TYPE_GSI_OFFLOAD: + rc = qed_ll2_rxq_completion_gsi(p_hwfn, p_ll2_conn, + cqe, flags, b_last_cqe); + break; case CORE_RX_CQE_TYPE_REGULAR: rc = qed_ll2_rxq_completion_reg(p_hwfn, p_ll2_conn, cqe, flags, b_last_cqe); @@ -527,6 +600,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, } p_ramrod->action_on_error.error_type = action_on_error; + p_ramrod->gsi_offload_flag = p_ll2_conn->gsi_enable; return qed_spq_post(p_hwfn, p_ent, NULL); } @@ -589,6 +663,7 @@ static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn, DP_NOTICE(p_hwfn, "Unknown connection type: %d\n", conn_type); } + p_ramrod->gsi_offload_flag = p_ll2_conn->gsi_enable; return qed_spq_post(p_hwfn, p_ent, NULL); } @@ -775,6 +850,7 @@ int qed_ll2_acquire_connection(struct qed_hwfn *p_hwfn, p_ll2_info->tx_dest = p_params->tx_dest; p_ll2_info->ai_err_packet_too_big = p_params->ai_err_packet_too_big; p_ll2_info->ai_err_no_buf = p_params->ai_err_no_buf; + p_ll2_info->gsi_enable = p_params->gsi_enable; rc = qed_ll2_acquire_connection_rx(p_hwfn, p_ll2_info, rx_num_desc); if (rc) @@ -1026,6 +1102,7 @@ static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, u16 vlan, u8 bd_flags, u16 l4_hdr_offset_w, + enum core_roce_flavor_type type, dma_addr_t first_frag, u16 first_frag_len) { @@ -1046,6 +1123,9 @@ static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, DMA_REGPAIR_LE(start_bd->addr, first_frag); start_bd->nbytes = cpu_to_le16(first_frag_len); + SET_FIELD(start_bd->bd_flags.as_bitfield, CORE_TX_BD_FLAGS_ROCE_FLAV, + type); + DP_VERBOSE(p_hwfn, (NETIF_MSG_TX_QUEUED | QED_MSG_LL2), "LL2 [q 0x%02x cid 0x%08x type 0x%08x] Tx Producer at [0x%04x] - set with a %04x bytes %02x BDs buffer at %08x:%08x\n", @@ -1137,11 +1217,13 @@ int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, u16 vlan, u8 bd_flags, u16 l4_hdr_offset_w, + enum qed_ll2_roce_flavor_type qed_roce_flavor, dma_addr_t first_frag, u16 first_frag_len, void *cookie, u8 notify_fw) { struct qed_ll2_tx_packet *p_curp = NULL; struct qed_ll2_info *p_ll2_conn = NULL; + enum core_roce_flavor_type roce_flavor; struct qed_ll2_tx_queue *p_tx; struct qed_chain *p_tx_chain; unsigned long flags; @@ -1174,6 +1256,15 @@ int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, goto out; } + if (qed_roce_flavor == QED_LL2_ROCE) { + roce_flavor = CORE_ROCE; + } else if (qed_roce_flavor == QED_LL2_RROCE) { + roce_flavor = CORE_RROCE; + } else { + rc = -EINVAL; + goto out; + } + /* Prepare packet and BD, and perhaps send a doorbell to FW */ qed_ll2_prepare_tx_packet_set(p_hwfn, p_tx, p_curp, num_of_bds, first_frag, @@ -1181,6 +1272,7 @@ int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, qed_ll2_prepare_tx_packet_set_bd(p_hwfn, p_ll2_conn, p_curp, num_of_bds, CORE_TX_DEST_NW, vlan, bd_flags, l4_hdr_offset_w, + roce_flavor, first_frag, first_frag_len); qed_ll2_tx_packet_notify(p_hwfn, p_ll2_conn); @@ -1476,6 +1568,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params) ll2_info.rx_vlan_removal_en = params->rx_vlan_stripping; ll2_info.tx_tc = 0; ll2_info.tx_dest = CORE_TX_DEST_NW; + ll2_info.gsi_enable = 1; rc = qed_ll2_acquire_connection(QED_LEADING_HWFN(cdev), &ll2_info, QED_LL2_RX_SIZE, QED_LL2_TX_SIZE, @@ -1625,8 +1718,8 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb) rc = qed_ll2_prepare_tx_packet(QED_LEADING_HWFN(cdev), cdev->ll2->handle, 1 + skb_shinfo(skb)->nr_frags, - vlan, flags, 0, mapping, - skb->len, skb, 1); + vlan, flags, 0, 0 /* RoCE FLAVOR */, + mapping, skb->len, skb, 1); if (rc) goto err; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index a037c4845928..80a5dc2d652d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -24,6 +24,12 @@ #define QED_MAX_NUM_OF_LL2_CONNECTIONS (4) +enum qed_ll2_roce_flavor_type { + QED_LL2_ROCE, + QED_LL2_RROCE, + MAX_QED_LL2_ROCE_FLAVOR_TYPE +}; + enum qed_ll2_conn_type { QED_LL2_TYPE_RESERVED, QED_LL2_TYPE_ISCSI, @@ -119,6 +125,7 @@ struct qed_ll2_info { u8 tx_stats_en; struct qed_ll2_rx_queue rx_queue; struct qed_ll2_tx_queue tx_queue; + u8 gsi_enable; }; /** @@ -199,6 +206,7 @@ int qed_ll2_prepare_tx_packet(struct qed_hwfn *p_hwfn, u16 vlan, u8 bd_flags, u16 l4_hdr_offset_w, + enum qed_ll2_roce_flavor_type qed_roce_flavor, dma_addr_t first_frag, u16 first_frag_len, void *cookie, u8 notify_fw); @@ -285,5 +293,24 @@ void qed_ll2_setup(struct qed_hwfn *p_hwfn, */ void qed_ll2_free(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_connections); - +void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, bool b_last_packet); +void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); +void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 8b4854d3b395..23430059471c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -64,6 +64,7 @@ #include "qed_reg_addr.h" #include "qed_sp.h" #include "qed_roce.h" +#include "qed_ll2.h" void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) @@ -2611,6 +2612,306 @@ void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) spin_unlock_bh(&p_hwfn->p_rdma_info->lock); } +void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet) +{ + struct qed_roce_ll2_packet *packet = cookie; + struct qed_roce_ll2_info *roce_ll2 = p_hwfn->ll2; + + roce_ll2->cbs.tx_cb(roce_ll2->cb_cookie, packet); +} + +void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet) +{ + qed_ll2b_complete_tx_gsi_packet(p_hwfn, connection_handle, + cookie, first_frag_addr, + b_last_fragment, b_last_packet); +} + +void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, bool b_last_packet) +{ + struct qed_roce_ll2_info *roce_ll2 = p_hwfn->ll2; + struct qed_roce_ll2_rx_params params; + struct qed_dev *cdev = p_hwfn->cdev; + struct qed_roce_ll2_packet pkt; + + DP_VERBOSE(cdev, + QED_MSG_LL2, + "roce ll2 rx complete: bus_addr=%p, len=%d, data_len_err=%d\n", + (void *)(uintptr_t)rx_buf_addr, + data_length, data_length_error); + + memset(&pkt, 0, sizeof(pkt)); + pkt.n_seg = 1; + pkt.payload[0].baddr = rx_buf_addr; + pkt.payload[0].len = data_length; + + memset(¶ms, 0, sizeof(params)); + params.vlan_id = vlan; + *((u32 *)¶ms.smac[0]) = ntohl(src_mac_addr_hi); + *((u16 *)¶ms.smac[4]) = ntohs(src_mac_addr_lo); + + if (data_length_error) { + DP_ERR(cdev, + "roce ll2 rx complete: data length error %d, length=%d\n", + data_length_error, data_length); + params.rc = -EINVAL; + } + + roce_ll2->cbs.rx_cb(roce_ll2->cb_cookie, &pkt, ¶ms); +} + +static int qed_roce_ll2_set_mac_filter(struct qed_dev *cdev, + u8 *old_mac_address, + u8 *new_mac_address) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + int rc = 0; + + if (!hwfn->ll2 || hwfn->ll2->handle == QED_LL2_UNUSED_HANDLE) { + DP_ERR(cdev, + "qed roce mac filter failed - roce_info/ll2 NULL\n"); + return -EINVAL; + } + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) { + DP_ERR(cdev, + "qed roce ll2 mac filter set: failed to acquire PTT\n"); + return -EINVAL; + } + + mutex_lock(&hwfn->ll2->lock); + if (old_mac_address) + qed_llh_remove_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + old_mac_address); + if (new_mac_address) + rc = qed_llh_add_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + new_mac_address); + mutex_unlock(&hwfn->ll2->lock); + + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + + if (rc) + DP_ERR(cdev, + "qed roce ll2 mac filter set: failed to add mac filter\n"); + + return rc; +} + +static int qed_roce_ll2_start(struct qed_dev *cdev, + struct qed_roce_ll2_params *params) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_roce_ll2_info *roce_ll2; + struct qed_ll2_info ll2_params; + int rc; + + if (!params) { + DP_ERR(cdev, "qed roce ll2 start: failed due to NULL params\n"); + return -EINVAL; + } + if (!params->cbs.tx_cb || !params->cbs.rx_cb) { + DP_ERR(cdev, + "qed roce ll2 start: failed due to NULL tx/rx. tx_cb=%p, rx_cb=%p\n", + params->cbs.tx_cb, params->cbs.rx_cb); + return -EINVAL; + } + if (!is_valid_ether_addr(params->mac_address)) { + DP_ERR(cdev, + "qed roce ll2 start: failed due to invalid Ethernet address %pM\n", + params->mac_address); + return -EINVAL; + } + + /* Initialize */ + roce_ll2 = kzalloc(sizeof(*roce_ll2), GFP_ATOMIC); + if (!roce_ll2) { + DP_ERR(cdev, "qed roce ll2 start: failed memory allocation\n"); + return -ENOMEM; + } + memset(roce_ll2, 0, sizeof(*roce_ll2)); + roce_ll2->handle = QED_LL2_UNUSED_HANDLE; + roce_ll2->cbs = params->cbs; + roce_ll2->cb_cookie = params->cb_cookie; + mutex_init(&roce_ll2->lock); + + memset(&ll2_params, 0, sizeof(ll2_params)); + ll2_params.conn_type = QED_LL2_TYPE_ROCE; + ll2_params.mtu = params->mtu; + ll2_params.rx_drop_ttl0_flg = true; + ll2_params.rx_vlan_removal_en = false; + ll2_params.tx_dest = CORE_TX_DEST_NW; + ll2_params.ai_err_packet_too_big = LL2_DROP_PACKET; + ll2_params.ai_err_no_buf = LL2_DROP_PACKET; + ll2_params.gsi_enable = true; + + rc = qed_ll2_acquire_connection(QED_LEADING_HWFN(cdev), &ll2_params, + params->max_rx_buffers, + params->max_tx_buffers, + &roce_ll2->handle); + if (rc) { + DP_ERR(cdev, + "qed roce ll2 start: failed to acquire LL2 connection (rc=%d)\n", + rc); + goto err; + } + + rc = qed_ll2_establish_connection(QED_LEADING_HWFN(cdev), + roce_ll2->handle); + if (rc) { + DP_ERR(cdev, + "qed roce ll2 start: failed to establish LL2 connection (rc=%d)\n", + rc); + goto err1; + } + + hwfn->ll2 = roce_ll2; + + rc = qed_roce_ll2_set_mac_filter(cdev, NULL, params->mac_address); + if (rc) { + hwfn->ll2 = NULL; + goto err2; + } + ether_addr_copy(roce_ll2->mac_address, params->mac_address); + + return 0; + +err2: + qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), roce_ll2->handle); +err1: + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), roce_ll2->handle); +err: + kfree(roce_ll2); + return rc; +} + +static int qed_roce_ll2_stop(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_roce_ll2_info *roce_ll2 = hwfn->ll2; + int rc; + + if (!cdev) { + DP_ERR(cdev, "qed roce ll2 stop: invalid cdev\n"); + return -EINVAL; + } + + if (roce_ll2->handle == QED_LL2_UNUSED_HANDLE) { + DP_ERR(cdev, "qed roce ll2 stop: cannot stop an unused LL2\n"); + return -EINVAL; + } + + /* remove LL2 MAC address filter */ + rc = qed_roce_ll2_set_mac_filter(cdev, roce_ll2->mac_address, NULL); + eth_zero_addr(roce_ll2->mac_address); + + rc = qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), + roce_ll2->handle); + if (rc) + DP_ERR(cdev, + "qed roce ll2 stop: failed to terminate LL2 connection (rc=%d)\n", + rc); + + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), roce_ll2->handle); + + roce_ll2->handle = QED_LL2_UNUSED_HANDLE; + + kfree(roce_ll2); + + return rc; +} + +static int qed_roce_ll2_tx(struct qed_dev *cdev, + struct qed_roce_ll2_packet *pkt, + struct qed_roce_ll2_tx_params *params) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_roce_ll2_info *roce_ll2 = hwfn->ll2; + enum qed_ll2_roce_flavor_type qed_roce_flavor; + u8 flags = 0; + int rc; + int i; + + if (!cdev || !pkt || !params) { + DP_ERR(cdev, + "roce ll2 tx: failed tx because one of the following is NULL - drv=%p, pkt=%p, params=%p\n", + cdev, pkt, params); + return -EINVAL; + } + + qed_roce_flavor = (pkt->roce_mode == ROCE_V1) ? QED_LL2_ROCE + : QED_LL2_RROCE; + + if (pkt->roce_mode == ROCE_V2_IPV4) + flags |= BIT(CORE_TX_BD_FLAGS_IP_CSUM_SHIFT); + + /* Tx header */ + rc = qed_ll2_prepare_tx_packet(QED_LEADING_HWFN(cdev), roce_ll2->handle, + 1 + pkt->n_seg, 0, flags, 0, + qed_roce_flavor, pkt->header.baddr, + pkt->header.len, pkt, 1); + if (rc) { + DP_ERR(cdev, "roce ll2 tx: header failed (rc=%d)\n", rc); + return QED_ROCE_TX_HEAD_FAILURE; + } + + /* Tx payload */ + for (i = 0; i < pkt->n_seg; i++) { + rc = qed_ll2_set_fragment_of_tx_packet(QED_LEADING_HWFN(cdev), + roce_ll2->handle, + pkt->payload[i].baddr, + pkt->payload[i].len); + if (rc) { + /* If failed not much to do here, partial packet has + * been posted * we can't free memory, will need to wait + * for completion + */ + DP_ERR(cdev, + "roce ll2 tx: payload failed (rc=%d)\n", rc); + return QED_ROCE_TX_FRAG_FAILURE; + } + } + + return 0; +} + +static int qed_roce_ll2_post_rx_buffer(struct qed_dev *cdev, + struct qed_roce_ll2_buffer *buf, + u64 cookie, u8 notify_fw) +{ + return qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev), + QED_LEADING_HWFN(cdev)->ll2->handle, + buf->baddr, buf->len, + (void *)(uintptr_t)cookie, notify_fw); +} + +static int qed_roce_ll2_stats(struct qed_dev *cdev, struct qed_ll2_stats *stats) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_roce_ll2_info *roce_ll2 = hwfn->ll2; + + return qed_ll2_get_stats(QED_LEADING_HWFN(cdev), + roce_ll2->handle, stats); +} + static const struct qed_rdma_ops qed_rdma_ops_pass = { .common = &qed_common_ops_pass, .fill_dev_info = &qed_fill_rdma_dev_info, @@ -2638,6 +2939,12 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = { .rdma_free_tid = &qed_rdma_free_tid, .rdma_register_tid = &qed_rdma_register_tid, .rdma_deregister_tid = &qed_rdma_deregister_tid, + .roce_ll2_start = &qed_roce_ll2_start, + .roce_ll2_stop = &qed_roce_ll2_stop, + .roce_ll2_tx = &qed_roce_ll2_tx, + .roce_ll2_post_rx_buffer = &qed_roce_ll2_post_rx_buffer, + .roce_ll2_set_mac_filter = &qed_roce_ll2_set_mac_filter, + .roce_ll2_stats = &qed_roce_ll2_stats, }; const struct qed_rdma_ops *qed_get_rdma_ops() diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index b8ddda456101..2f091e8a0f40 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -42,6 +42,7 @@ #include "qed.h" #include "qed_dev_api.h" #include "qed_hsi.h" +#include "qed_ll2.h" #define QED_RDMA_MAX_FMR (RDMA_MAX_TIDS) #define QED_RDMA_MAX_P_KEY (1) diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 0b6df6eedcf1..53047d3fa678 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -39,6 +39,16 @@ #include #include #include +#include + +enum qed_roce_ll2_tx_dest { + /* Light L2 TX Destination to the Network */ + QED_ROCE_LL2_TX_DEST_NW, + + /* Light L2 TX Destination to the Loopback */ + QED_ROCE_LL2_TX_DEST_LB, + QED_ROCE_LL2_TX_DEST_MAX +}; #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) @@ -461,6 +471,61 @@ struct qed_rdma_counters_out_params { #define QED_ROCE_TX_HEAD_FAILURE (1) #define QED_ROCE_TX_FRAG_FAILURE (2) +struct qed_roce_ll2_header { + void *vaddr; + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_buffer { + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_packet { + struct qed_roce_ll2_header header; + int n_seg; + struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; + int roce_mode; + enum qed_roce_ll2_tx_dest tx_dest; +}; + +struct qed_roce_ll2_tx_params { + int reserved; +}; + +struct qed_roce_ll2_rx_params { + u16 vlan_id; + u8 smac[ETH_ALEN]; + int rc; +}; + +struct qed_roce_ll2_cbs { + void (*tx_cb)(void *pdev, struct qed_roce_ll2_packet *pkt); + + void (*rx_cb)(void *pdev, struct qed_roce_ll2_packet *pkt, + struct qed_roce_ll2_rx_params *params); +}; + +struct qed_roce_ll2_params { + u16 max_rx_buffers; + u16 max_tx_buffers; + u16 mtu; + u8 mac_address[ETH_ALEN]; + struct qed_roce_ll2_cbs cbs; + void *cb_cookie; +}; + +struct qed_roce_ll2_info { + u8 handle; + struct qed_roce_ll2_cbs cbs; + u8 mac_address[ETH_ALEN]; + void *cb_cookie; + + /* Lock to protect ll2 */ + struct mutex lock; +}; + enum qed_rdma_type { QED_RDMA_TYPE_ROCE, }; @@ -518,6 +583,20 @@ struct qed_rdma_ops { int (*rdma_deregister_tid)(void *rdma_cxt, u32 itid); int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid); void (*rdma_free_tid)(void *rdma_cxt, u32 itid); + int (*roce_ll2_start)(struct qed_dev *cdev, + struct qed_roce_ll2_params *params); + int (*roce_ll2_stop)(struct qed_dev *cdev); + int (*roce_ll2_tx)(struct qed_dev *cdev, + struct qed_roce_ll2_packet *packet, + struct qed_roce_ll2_tx_params *params); + int (*roce_ll2_post_rx_buffer)(struct qed_dev *cdev, + struct qed_roce_ll2_buffer *buf, + u64 cookie, u8 notify_fw); + int (*roce_ll2_set_mac_filter)(struct qed_dev *cdev, + u8 *old_mac_address, + u8 *new_mac_address); + int (*roce_ll2_stats)(struct qed_dev *cdev, + struct qed_ll2_stats *stats); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From a4cc96d1f0170b779c32c6b2cc58764f5d2cdef0 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Mon, 3 Oct 2016 12:53:13 +0530 Subject: net: phy: Add Edge-rate driver for Microsemi PHYs. Edge-rate: As system and networking speeds increase, a signal's output transition, also know as the edge rate or slew rate (V/ns), takes on greater importance because high-speed signals come with a price. That price is an assortment of interference problems like ringing on the line, signal overshoot and undershoot, extended signal settling times, crosstalk noise, transmission line reflections, false signal detection by the receiving device and electromagnetic interference (EMI) -- all of which can negate the potential gains designers are seeking when they try to increase system speeds through the use of higher performance logic devices. The fact is, faster signaling edge rates can cause a higher level of electrical noise or other type of interference that can actually lead to slower line speeds and lower maximum system frequencies. This parameter allow the board designers to change the driving strange, and thereby change the EMI behavioral. Edge-rate parameters (vddmac, edge-slowdown) get from Device Tree. Tested on Beaglebone Black with VSC 8531 PHY. Signed-off-by: Raju Lakkaraju Signed-off-by: David S. Miller --- .../devicetree/bindings/net/mscc-phy-vsc8531.txt | 58 ++++++++++ drivers/net/phy/mscc.c | 125 +++++++++++++++++++++ include/dt-bindings/net/mscc-phy-vsc8531.h | 21 ++++ 3 files changed, 204 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt create mode 100644 include/dt-bindings/net/mscc-phy-vsc8531.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt new file mode 100644 index 000000000000..99c7eb0a00c8 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt @@ -0,0 +1,58 @@ +* Microsemi - vsc8531 Giga bit ethernet phy + +Required properties: +- compatible : Should contain phy id as "ethernet-phy-idAAAA.BBBB" + The PHY device uses the binding described in + Documentation/devicetree/bindings/net/phy.txt + +Optional properties: +- vsc8531,vddmac : The vddmac in mV. +- vsc8531,edge-slowdown : % the edge should be slowed down relative to + the fastest possible edge time. Native sign + need not enter. + Edge rate sets the drive strength of the MAC + interface output signals. Changing the drive + strength will affect the edge rate of the output + signal. The goal of this setting is to help + reduce electrical emission (EMI) by being able + to reprogram drive strength and in effect slow + down the edge rate if desired. Table 1 shows the + impact to the edge rate per VDDMAC supply for each + drive strength setting. + Ref: Table:1 - Edge rate change below. + +Note: see dt-bindings/net/mscc-phy-vsc8531.h for applicable values + +Table: 1 - Edge rate change +----------------------------------------------------------------| +| Edge Rate Change (VDDMAC) | +| | +| 3300 mV 2500 mV 1800 mV 1500 mV | +|---------------------------------------------------------------| +| Default Deafult Default Default | +| (Fastest) (recommended) (recommended) | +|---------------------------------------------------------------| +| -2% -3% -5% -6% | +|---------------------------------------------------------------| +| -4% -6% -9% -14% | +|---------------------------------------------------------------| +| -7% -10% -16% -21% | +|(recommended) (recommended) | +|---------------------------------------------------------------| +| -10% -14% -23% -29% | +|---------------------------------------------------------------| +| -17% -23% -35% -42% | +|---------------------------------------------------------------| +| -29% -37% -52% -58% | +|---------------------------------------------------------------| +| -53% -63% -76% -77% | +| (slowest) | +|---------------------------------------------------------------| + +Example: + + vsc8531_0: ethernet-phy@0 { + compatible = "ethernet-phy-id0007.0570"; + vsc8531,vddmac = <3300>; + vsc8531,edge-slowdown = <21>; + }; diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c index d350debd174a..a17573e3bd8a 100644 --- a/drivers/net/phy/mscc.c +++ b/drivers/net/phy/mscc.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include enum rgmii_rx_clock_delay { RGMII_RX_CLK_DELAY_0_2_NS = 0, @@ -37,6 +39,10 @@ enum rgmii_rx_clock_delay { #define MII_VSC85XX_INT_MASK_MASK 0xa000 #define MII_VSC85XX_INT_STATUS 26 +#define MSCC_PHY_WOL_MAC_CONTROL 27 +#define EDGE_RATE_CNTL_POS 5 +#define EDGE_RATE_CNTL_MASK 0x00E0 + #define MSCC_EXT_PAGE_ACCESS 31 #define MSCC_PHY_PAGE_STANDARD 0x0000 /* Standard registers */ #define MSCC_PHY_PAGE_EXTENDED_2 0x0002 /* Extended reg - page 2 */ @@ -50,6 +56,23 @@ enum rgmii_rx_clock_delay { #define PHY_ID_VSC8531 0x00070570 #define PHY_ID_VSC8541 0x00070770 +struct edge_rate_table { + u16 vddmac; + int slowdown[MSCC_SLOWDOWN_MAX]; +}; + +struct edge_rate_table edge_table[MSCC_VDDMAC_MAX] = { + {3300, { 0, -2, -4, -7, -10, -17, -29, -53} }, + {2500, { 0, -3, -6, -10, -14, -23, -37, -63} }, + {1800, { 0, -5, -9, -16, -23, -35, -52, -76} }, + {1500, { 0, -6, -14, -21, -29, -42, -58, -77} }, +}; + +struct vsc8531_private { + u8 edge_slowdown; + u16 vddmac; +}; + static int vsc85xx_phy_page_set(struct phy_device *phydev, u8 page) { int rc; @@ -58,6 +81,51 @@ static int vsc85xx_phy_page_set(struct phy_device *phydev, u8 page) return rc; } +static u8 edge_rate_magic_get(u16 vddmac, + int slowdown) +{ + int rc = (MSCC_SLOWDOWN_MAX - 1); + u8 vdd; + u8 sd; + + for (vdd = 0; vdd < MSCC_VDDMAC_MAX; vdd++) { + if (edge_table[vdd].vddmac == vddmac) { + for (sd = 0; sd < MSCC_SLOWDOWN_MAX; sd++) { + if (edge_table[vdd].slowdown[sd] <= slowdown) { + rc = (MSCC_SLOWDOWN_MAX - sd - 1); + break; + } + } + } + } + + return rc; +} + +static int vsc85xx_edge_rate_cntl_set(struct phy_device *phydev, + u8 edge_rate) +{ + int rc; + u16 reg_val; + + mutex_lock(&phydev->lock); + rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2); + if (rc != 0) + goto out_unlock; + reg_val = phy_read(phydev, MSCC_PHY_WOL_MAC_CONTROL); + reg_val &= ~(EDGE_RATE_CNTL_MASK); + reg_val |= (edge_rate << EDGE_RATE_CNTL_POS); + rc = phy_write(phydev, MSCC_PHY_WOL_MAC_CONTROL, reg_val); + if (rc != 0) + goto out_unlock; + rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD); + +out_unlock: + mutex_unlock(&phydev->lock); + + return rc; +} + static int vsc85xx_mac_if_set(struct phy_device *phydev, phy_interface_t interface) { @@ -116,9 +184,45 @@ out_unlock: return rc; } +#ifdef CONFIG_OF_MDIO +static int vsc8531_of_init(struct phy_device *phydev) +{ + int rc; + struct vsc8531_private *vsc8531 = phydev->priv; + struct device *dev = &phydev->mdio.dev; + struct device_node *of_node = dev->of_node; + + if (!of_node) + return -ENODEV; + + rc = of_property_read_u16(of_node, "vsc8531,vddmac", + &vsc8531->vddmac); + if (rc == -EINVAL) + vsc8531->vddmac = MSCC_VDDMAC_3300; + rc = of_property_read_u8(of_node, "vsc8531,edge-slowdown", + &vsc8531->edge_slowdown); + if (rc == -EINVAL) + vsc8531->edge_slowdown = 0; + + rc = 0; + return rc; +} +#else +static int vsc8531_of_init(struct phy_device *phydev) +{ + return 0; +} +#endif /* CONFIG_OF_MDIO */ + static int vsc85xx_config_init(struct phy_device *phydev) { int rc; + struct vsc8531_private *vsc8531 = phydev->priv; + u8 edge_rate; + + rc = vsc8531_of_init(phydev); + if (rc) + return rc; rc = vsc85xx_default_config(phydev); if (rc) @@ -128,6 +232,12 @@ static int vsc85xx_config_init(struct phy_device *phydev) if (rc) return rc; + edge_rate = edge_rate_magic_get(vsc8531->vddmac, + -(int)vsc8531->edge_slowdown); + rc = vsc85xx_edge_rate_cntl_set(phydev, edge_rate); + if (rc) + return rc; + rc = genphy_config_init(phydev); return rc; @@ -160,6 +270,19 @@ static int vsc85xx_config_intr(struct phy_device *phydev) return rc; } +static int vsc85xx_probe(struct phy_device *phydev) +{ + struct vsc8531_private *vsc8531; + + vsc8531 = devm_kzalloc(&phydev->mdio.dev, sizeof(*vsc8531), GFP_KERNEL); + if (!vsc8531) + return -ENOMEM; + + phydev->priv = vsc8531; + + return 0; +} + /* Microsemi VSC85xx PHYs */ static struct phy_driver vsc85xx_driver[] = { { @@ -177,6 +300,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .probe = &vsc85xx_probe, }, { .phy_id = PHY_ID_VSC8541, @@ -193,6 +317,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .probe = &vsc85xx_probe, } }; diff --git a/include/dt-bindings/net/mscc-phy-vsc8531.h b/include/dt-bindings/net/mscc-phy-vsc8531.h new file mode 100644 index 000000000000..2383dd20ff43 --- /dev/null +++ b/include/dt-bindings/net/mscc-phy-vsc8531.h @@ -0,0 +1,21 @@ +/* + * Device Tree constants for Microsemi VSC8531 PHY + * + * Author: Nagaraju Lakkaraju + * + * License: Dual MIT/GPL + * Copyright (c) 2016 Microsemi Corporation + */ + +#ifndef _DT_BINDINGS_MSCC_VSC8531_H +#define _DT_BINDINGS_MSCC_VSC8531_H + +/* MAC interface Edge rate control VDDMAC in milli Volts */ +#define MSCC_VDDMAC_3300 3300 +#define MSCC_VDDMAC_2500 2500 +#define MSCC_VDDMAC_1800 1800 +#define MSCC_VDDMAC_1500 1500 +#define MSCC_VDDMAC_MAX 4 +#define MSCC_SLOWDOWN_MAX 8 + +#endif -- cgit v1.2.3 From c0cd1ba4f8bd8b5fef43bc51a2983673b8f086ff Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:53 +1100 Subject: net/ncsi: Introduce ncsi_stop_dev() This introduces ncsi_stop_dev(), as counterpart to ncsi_start_dev(), to stop the NCSI device so that it can be reenabled in future. This API should be called when the network device driver is going to shutdown the device. There are 3 things done in the function: Stop the channel monitoring; Reset channels to inactive state; Report NCSI link down. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- include/net/ncsi.h | 5 +++++ net/ncsi/ncsi-manage.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/ncsi.h b/include/net/ncsi.h index 1dbf42f79750..68680baac0fd 100644 --- a/include/net/ncsi.h +++ b/include/net/ncsi.h @@ -31,6 +31,7 @@ struct ncsi_dev { struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*notifier)(struct ncsi_dev *nd)); int ncsi_start_dev(struct ncsi_dev *nd); +void ncsi_stop_dev(struct ncsi_dev *nd); void ncsi_unregister_dev(struct ncsi_dev *nd); #else /* !CONFIG_NET_NCSI */ static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev, @@ -44,6 +45,10 @@ static inline int ncsi_start_dev(struct ncsi_dev *nd) return -ENOTTY; } +static void ncsi_stop_dev(struct ncsi_dev *nd) +{ +} + static inline void ncsi_unregister_dev(struct ncsi_dev *nd) { } diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 4742c7c6c748..5e509e547c2d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1187,11 +1187,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - struct ncsi_package *np; - struct ncsi_channel *nc; - unsigned long flags; - bool chained; - int old_state, ret; + int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) @@ -1203,9 +1199,29 @@ int ncsi_start_dev(struct ncsi_dev *nd) return 0; } - /* Reset channel's state and start over */ + if (ndp->flags & NCSI_DEV_HWA) + ret = ncsi_enable_hwa(ndp); + else + ret = ncsi_choose_active_channel(ndp); + + return ret; +} +EXPORT_SYMBOL_GPL(ncsi_start_dev); + +void ncsi_stop_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + bool chained; + int old_state; + unsigned long flags; + + /* Stop the channel monitor and reset channel's state */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; @@ -1217,14 +1233,9 @@ int ncsi_start_dev(struct ncsi_dev *nd) } } - if (ndp->flags & NCSI_DEV_HWA) - ret = ncsi_enable_hwa(ndp); - else - ret = ncsi_choose_active_channel(ndp); - - return ret; + ncsi_report_link(ndp, true); } -EXPORT_SYMBOL_GPL(ncsi_start_dev); +EXPORT_SYMBOL_GPL(ncsi_stop_dev); void ncsi_unregister_dev(struct ncsi_dev *nd) { -- cgit v1.2.3